Repository: hbilen/dynamic-image-nets
Branch: master
Commit: 96b91afab109
Files: 28
Total size: 93.3 KB
Directory structure:
gitextract_6iyvfhqz/
├── .gitmodules
├── Datasets/
│ ├── cnn_hmdb51_of_setup_data.m
│ ├── cnn_hmdb51_setup_data.m
│ ├── cnn_ucf101_of_setup_data.m
│ └── cnn_ucf101_setup_data.m
├── Layers/
│ ├── AppRankPooling.m
│ ├── BatchNormN.m
│ ├── ErrorMultiClass.m
│ ├── L2Normalize.m
│ ├── LossNormalized.m
│ ├── TemporalPooling.m
│ ├── vl_nnarpooltemporal.m
│ ├── vl_nnl2norm.m
│ └── vl_nnpooltemporal.m
├── README.md
├── dicnn/
│ ├── cnn_dicnn_of.m
│ ├── cnn_dicnn_rgb.m
│ ├── cnn_init_cafferef.m
│ ├── cnn_init_resnext.m
│ ├── cnn_single_of.m
│ ├── cnn_single_rgb.m
│ ├── cnn_train_dicnn_dag.m
│ ├── cnn_video_of_get_batch.m
│ ├── cnn_video_rgb_get_batch.m
│ ├── compute_approximate_dynamic_images.m
│ └── visualize_approximate_dynamic_images.m
├── main_train.m
└── utils/
└── extract_frames.sh
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitmodules
================================================
[submodule "matconvnet"]
path = matconvnet
url = https://github.com/vlfeat/matconvnet
branch = master
================================================
FILE: Datasets/cnn_hmdb51_of_setup_data.m
================================================
function imdb = cnn_hmdb51_of_setup_data(varargin)
% CNN_HMDB51_OF_SETUP_DATA Initialize HMDB51 - Action Recognition Data Set
% this script requires HMDB51 downloaded and TV-L1 optical flow frames
% extracted in the tvl1_flow folder
opts.dataDir = fullfile('data','HMDB51') ;
opts.lite = false ;
opts = vl_argparse(opts, varargin) ;
%% ------------------------------------------------------------------------
% Load categories metadata
% -------------------------------------------------------------------------
% find images
imagePath = fullfile(opts.dataDir, 'tvl1_flow', 'u', '*') ;
images = dir(imagePath) ;
videoNames = cell(1,numel(images)) ;
frameNames = cell(1,numel(images)) ;
nrFrames = zeros(1,numel(images)) ;
for i=1:numel(images)
frames = dir(fullfile(opts.dataDir,'tvl1_flow','u',images(i).name,'frame*.jpg')) ;
framesc = cell(1,numel(frames)) ;
if ~isempty(frames)
for j=1:numel(frames)
framesc{j} = frames(j).name ;
end
frameNames{i} = strcat(images(i).name,'/',framesc) ;
nrFrames(i) = numel(framesc) ;
videoNames{i} = images(i).name ;
end
end
videoNames(nrFrames==0) = [] ;
frameNames(nrFrames==0) = [] ;
% nrFrames(nrFrames==0) = [] ;
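% interleave horizontal (u) and vertical (v) flow frames so that
% consecutive entries form one (u,v) flow pair per video frame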
frameNamesuv = cell(1,numel(frameNames)) ;
for i=1:numel(frameNames)
nn = frameNames{i} ;
nn1 = strcat('u/',nn) ;
nn2 = strcat('v/',nn) ;
frameNamesuv{i} = cell(1,2*numel(nn1)) ;
frameNamesuv{i}(1:2:end) = nn1 ;
frameNamesuv{i}(2:2:end) = nn2 ;
end
% find metadata
% ncls = 51 ;
metaPath = fullfile(opts.dataDir, 'hmdb51_splits', '*.txt') ;
splits = dir(metaPath) ;
cats = cell(1,numel(videoNames)) ;
sets = zeros(3,numel(videoNames)) ;
catNames = cell(1,numel(splits)) ;
for i=1:numel(splits)
j = strfind(splits(i).name,'_test_') ;
splitno = str2double(splits(i).name(j+11)) ;
catNames{i} = splits(i).name(1:j-1) ;
t = importdata(fullfile(opts.dataDir, 'hmdb51_splits', splits(i).name)) ;
vids = cell(1,numel(t.textdata)) ;
for k=1:numel(t.textdata)
vids{k} = t.textdata{k}(1:end-4) ;
end
[ia,ib] = ismember(vids,videoNames) ;
assert(all(ia)) ;
sets(splitno,ib) = t.data' ;
cats(ib) = repmat(catNames(i),numel(ia),1) ;
end
[cu,~,labels] = unique(cats) ;
sets(sets(:)==2) = 3 ;
imdb.classes.name = cu ;
imdb.images.name = videoNames ;
imdb.images.names = frameNamesuv ;
imdb.images.label = labels' ;
imdb.images.sets = sets ;
imdb.imageDir = fullfile(opts.dataDir, 'tvl1_flow') ;
================================================
FILE: Datasets/cnn_hmdb51_setup_data.m
================================================
function imdb = cnn_hmdb51_setup_data(varargin)
% CNN_HMDB51_SETUP_DATA Initialize HMDB51 - Action Recognition Data Set
% this script requires HMDB51 downloaded and frames extracted in the
% frames folder
opts.dataDir = fullfile('data','HMDB51') ;
opts.lite = false ;
opts = vl_argparse(opts, varargin) ;
%% ------------------------------------------------------------------------
% Load categories metadata
% -------------------------------------------------------------------------
% find images
imagePath = fullfile(opts.dataDir, 'frames', '*') ;
images = dir(imagePath) ;
videoNames = cell(1,numel(images)) ;
frameNames = cell(1,numel(images)) ;
nrFrames = zeros(1,numel(images)) ;
for i=1:numel(images)
frames = dir(fullfile(opts.dataDir,'frames',images(i).name,'frame*.jpg')) ;
framesc = cell(1,numel(frames)) ;
if ~isempty(frames)
for j=1:numel(frames)
framesc{j} = frames(j).name ;
end
frameNames{i} = strcat(images(i).name,'/',framesc) ;
nrFrames(i) = numel(framesc) ;
videoNames{i} = images(i).name ;
end
end
videoNames(nrFrames==0) = [] ;
frameNames(nrFrames==0) = [] ;
% nrFrames(nrFrames==0) = [] ;
% find metadata
% ncls = 51 ;
metaPath = fullfile(opts.dataDir, 'hmdb51_splits', '*.txt') ;
splits = dir(metaPath) ;
% splitFiles = cell(1,3*ncls) ;
cats = cell(1,numel(videoNames)) ;
sets = zeros(3,numel(videoNames)) ;
catNames = cell(1,numel(splits)) ;
for i=1:numel(splits)
j = strfind(splits(i).name,'_test_') ;
splitno = str2double(splits(i).name(j+11)) ;
catNames{i} = splits(i).name(1:j-1) ;
t = importdata(fullfile(opts.dataDir, 'hmdb51_splits', splits(i).name)) ;
vids = cell(1,numel(t.textdata)) ;
for k=1:numel(t.textdata)
vids{k} = t.textdata{k}(1:end-4) ;
end
[ia,ib] = ismember(vids,videoNames) ;
assert(all(ia)) ;
sets(splitno,ib) = t.data' ;
cats(ib) = repmat(catNames(i),numel(ia),1) ;
end
[cu,~,labels] = unique(cats) ;
sets(sets(:)==2) = 3 ;
imdb.classes.name = cu ;
imdb.images.name = videoNames ;
imdb.images.names = frameNames ;
imdb.images.label = labels' ;
imdb.images.sets = sets ;
imdb.imageDir = fullfile(opts.dataDir, 'frames') ;
================================================
FILE: Datasets/cnn_ucf101_of_setup_data.m
================================================
function imdb = cnn_ucf101_of_setup_data(varargin)
% CNN_UCF101_OF_SETUP_DATA Initialize UCF101 - Action Recognition Data Set
% http://crcv.ucf.edu/data/UCF101.php
% this script requires UCF101 downloaded and TV-L1 optical flow frames
% extracted in the tvl1_flow folder
opts.dataDir = fullfile('data','UCF101') ;
opts.lite = false ;
opts = vl_argparse(opts, varargin) ;
%% ------------------------------------------------------------------------
% Load categories metadata
% -------------------------------------------------------------------------
% find metadata
metaPath = fullfile(opts.dataDir, 'ucfTrainTestlist/classInd.txt') ;
fprintf('using metadata %s\n', metaPath) ;
tmp = importdata(metaPath);
nCls = numel(tmp);
if nCls ~= 101
error('Wrong meta file %s',metaPath);
end
cats = cell(1,nCls);
for i=1:numel(tmp)
t = strsplit(tmp{i});
cats{i} = t{2};
end
imdb.classes.name = cats ; % labels index into cats; categories are sorted in postprocessing
imdb.imageDir = fullfile(opts.dataDir, 'tvl1_flow', 'u') ;
%% ------------------------------------------------------------------------
% load image names and labels
% -------------------------------------------------------------------------
fprintf('searching training images ...\n') ;
names = {} ;
name = {};
labels = {} ;
for d = dir(fullfile(imdb.imageDir, 'v_*'))'
[~,lab] = ismember(lower(d.name(3:end-8)), lower(cats)) ;
if lab==0
error('no class label found for %s',d.name);
end
ims = dir(fullfile(imdb.imageDir, d.name, '*.jpg')) ;
name{end+1} = d.name;
names{end+1} = strcat([d.name, filesep], {ims.name}) ;
labels{end+1} = lab ;
if mod(numel(names), 10) == 0, fprintf('.') ; end
if mod(numel(names), 500) == 0, fprintf('\n') ; end
%fprintf('found %s with %d images\n', d.name, numel(ims)) ;
end
% names = horzcat(names{:}) ;
labels = horzcat(labels{:}) ;
% labels = [labels ; labels] ;
labels = labels(:)' ;
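% interleave u (horizontal) and v (vertical) flow frames:
% odd entries are u, even entries are v, one (u,v) pair per frame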
for i=1:numel(names)
nn = names{i} ;
nn1 = strcat('u/',nn) ;
nn2 = strcat('v/',nn) ;
names{i} = cell(1,2*numel(nn1)) ;
names{i}(1:2:end) = nn1 ;
names{i}(2:2:end) = nn2 ;
end
imdb.images.id = 1:numel(names) ;
imdb.images.name = name ;
imdb.images.names = names ;
imdb.images.label = labels ;
imdb.imageDir = fullfile(opts.dataDir, 'tvl1_flow') ;
%% ------------------------------------------------------------------------
% load train / test splits
% -------------------------------------------------------------------------
fprintf('labeling data... (this may take a couple of minutes)') ;
imdb.images.sets = zeros(3, numel(names)) ;
setNames = {'train','test'};
setVal = [1,3];
for s=1:numel(setNames)
for i=1:3
trainFl = fullfile(opts.dataDir, 'ucfTrainTestlist',sprintf('%slist%02d.txt',...
setNames{s},i)) ;
trainList = importdata(trainFl);
if isfield(trainList,'textdata')
trainList = trainList.textdata;
end
for j=1:numel(trainList)
tmp = strsplit(trainList{j},'/');
[~,lab] = ismember(lower(tmp{2}(1:end-4)), lower(name)) ;
if lab==0
% error('cannot find the video %s',tmp{2}(1:end-4));
warning('cannot find the video %s',tmp{2}(1:end-4));
continue ;
end
% if trainList.data(j) ~= labels(lab)
% error('Labels do not match for %s',tmp{2});
% end
imdb.images.sets(i,lab) = setVal(s);
end
end
end
fprintf('\n') ;
%% ------------------------------------------------------------------------
% Postprocessing
% -------------------------------------------------------------------------
% sort categories alphabetically (to be compatible with other implementations)
[imdb.classes.name,perm] = sort(imdb.classes.name) ;
relabel(perm) = 1:numel(imdb.classes.name) ;
ok = imdb.images.label > 0 ;
imdb.images.label(ok) = relabel(imdb.images.label(ok)) ;
if opts.lite
% pick a small number of images for the first 10 classes
% this cannot be done for test as we do not have test labels
clear keep ;
for i=1:10
sel = find(imdb.images.label == i) ;
train = sel(imdb.images.sets(1,sel) == 1) ;
test = sel(imdb.images.sets(1,sel) == 3) ;
keep{i} = [train test] ;
end
keep = [keep{:}] ;
imdb.images.id = imdb.images.id(keep) ;
imdb.images.name = imdb.images.name(keep) ;
imdb.images.names = imdb.images.names(keep) ;
imdb.images.sets = imdb.images.sets(1,keep) ;
imdb.images.label = imdb.images.label(keep) ;
end
================================================
FILE: Datasets/cnn_ucf101_setup_data.m
================================================
function imdb = cnn_ucf101_setup_data(varargin)
% CNN_UCF101_SETUP_DATA Initialize UCF101 - Action Recognition Data Set
% http://crcv.ucf.edu/data/UCF101.php
% this script requires UCF101 downloaded and frames extracted in the
% frames folder
opts.dataDir = fullfile('data','UCF101') ;
opts.lite = false ;
opts = vl_argparse(opts, varargin) ;
%% ------------------------------------------------------------------------
% Load categories metadata
% -------------------------------------------------------------------------
% find metadata
metaPath = fullfile(opts.dataDir, 'ucfTrainTestlist/classInd.txt') ;
fprintf('using metadata %s\n', metaPath) ;
tmp = importdata(metaPath);
nCls = numel(tmp);
if nCls ~= 101
error('Wrong meta file %s',metaPath);
end
cats = cell(1,nCls);
for i=1:numel(tmp)
t = strsplit(tmp{i});
cats{i} = t{2};
end
imdb.classes.name = cats ;
imdb.imageDir = fullfile(opts.dataDir, 'frames') ;
%% ------------------------------------------------------------------------
% load image names and labels
% -------------------------------------------------------------------------
fprintf('searching training images ...\n') ;
names = {} ;
name = {};
labels = {} ;
for d = dir(fullfile(imdb.imageDir, 'v_*'))'
[~,lab] = ismember(lower(d.name(3:end-8)), lower(cats)) ;
if lab==0
error('no class label found for %s',d.name);
end
ims = dir(fullfile(imdb.imageDir, d.name, '*.jpg')) ;
name{end+1} = d.name;
names{end+1} = strcat([d.name, filesep], {ims.name}) ;
labels{end+1} = lab ;
if mod(numel(names), 10) == 0, fprintf('.') ; end
if mod(numel(names), 500) == 0, fprintf('\n') ; end
%fprintf('found %s with %d images\n', d.name, numel(ims)) ;
end
% names = horzcat(names{:}) ;
labels = horzcat(labels{:}) ;
imdb.images.id = 1:numel(names) ;
imdb.images.name = name ;
imdb.images.names = names ;
imdb.images.label = labels ;
%% ------------------------------------------------------------------------
% load train / test splits
% -------------------------------------------------------------------------
fprintf('labeling data... (this may take a couple of minutes)') ;
imdb.images.sets = zeros(3, numel(names)) ;
setNames = {'train','test'};
setVal = [1,3];
for s=1:numel(setNames)
for i=1:3
trainFl = fullfile(opts.dataDir, 'ucfTrainTestlist',sprintf('%slist%02d.txt',...
setNames{s},i)) ;
trainList = importdata(trainFl);
if isfield(trainList,'textdata')
trainList = trainList.textdata;
end
for j=1:numel(trainList)
tmp = strsplit(trainList{j},'/');
[~,lab] = ismember(lower(tmp{2}(1:end-4)), lower(name)) ;
if lab==0
error('cannot find the video %s',tmp{2});
end
% if trainList.data(j) ~= labels(lab)
% error('Labels do not match for %s',tmp{2});
% end
imdb.images.sets(i,lab) = setVal(s);
end
end
end
fprintf('\n') ;
%% ------------------------------------------------------------------------
% Postprocessing
% -------------------------------------------------------------------------
% sort categories alphabetically (to be compatible with other implementations)
[imdb.classes.name,perm] = sort(imdb.classes.name) ;
relabel(perm) = 1:numel(imdb.classes.name) ;
ok = imdb.images.label > 0 ;
imdb.images.label(ok) = relabel(imdb.images.label(ok)) ;
if opts.lite
% pick a small number of images for the first 10 classes
% this cannot be done for test as we do not have test labels
clear keep ;
for i=1:10
sel = find(imdb.images.label == i) ;
train = sel(imdb.images.sets(1,sel) == 1) ;
test = sel(imdb.images.sets(1,sel) == 3) ;
keep{i} = [train test] ;
end
keep = [keep{:}] ;
imdb.images.id = imdb.images.id(keep) ;
imdb.images.name = imdb.images.name(keep) ;
imdb.images.names = imdb.images.names(keep) ;
imdb.images.sets = imdb.images.sets(1,keep) ;
imdb.images.label = imdb.images.label(keep) ;
end
================================================
FILE: Layers/AppRankPooling.m
================================================
classdef AppRankPooling < dagnn.ElementWise
% author: Hakan Bilen
% dagnn wrapper for approximate rank pooling
properties
scale = 1
end
methods
function outputs = forward(obj, inputs, params)
outputs{1} = vl_nnarpooltemporal(inputs{1},inputs{2}) * obj.scale ;
end
function [derInputs, derParams] = backward(obj, inputs, params, derOutputs)
derInputs = cell(1,2);
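% rank pooling is linear in its input, so the backward pass re-applies
% the same per-frame coefficients to derOutputs; the frame-video ids
% (inputs{2}) receive no gradient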
derInputs{1} = vl_nnarpooltemporal(inputs{1},inputs{2},derOutputs{1}) * obj.scale;
derParams = {} ;
end
function outputSizes = getOutputSizes(obj, inputSizes)
% This is not correct, dim(4) depends on inputs{2}
outputSizes{1} = inputSizes{1} ;
end
function obj = AppRankPooling(varargin)
obj.load(varargin) ;
end
end
end
================================================
FILE: Layers/BatchNormN.m
================================================
classdef BatchNormN < dagnn.ElementWise
properties
numChannels
epsilon = 1e-5
opts = {'NoCuDNN'} % ours seems slightly faster
end
properties (Transient)
moments
end
methods
function outputs = forward(obj, inputs, params)
if strcmp(obj.net.mode, 'test')
outputs{1} = vl_nnbnorm(inputs{1}, params{1}, params{2}, ...
'moments', params{3}, ...
'epsilon', obj.epsilon, ...
obj.opts{:}) ;
else
[outputs{1},obj.moments] = ...
vl_nnbnorm(inputs{1}, params{1}, params{2}, ...
'epsilon', obj.epsilon, ...
obj.opts{:}) ;
end
end
function [derInputs, derParams] = backward(obj, inputs, params, derOutputs)
[derInputs{1}, derParams{1}, derParams{2}, derParams{3}] = ...
vl_nnbnorm(inputs{1}, params{1}, params{2}, derOutputs{1}, ...
'epsilon', obj.epsilon, ...
'moments', obj.moments, ...
obj.opts{:}) ;
obj.moments = [] ;
% multiply the moments update by the number of images in the batch
% this is required to make the update additive for subbatches
% and will eventually be normalized away
% derParams{3} = derParams{3} * size(inputs{1},4) ;
end
% ---------------------------------------------------------------------
function obj = BatchNormN(varargin)
obj.load(varargin{:}) ;
end
function params = initParams(obj)
params{1} = ones(obj.numChannels,1,'single') ;
params{2} = zeros(obj.numChannels,1,'single') ;
params{3} = zeros(obj.numChannels,2,'single') ;
end
function attach(obj, net, index)
attach@dagnn.ElementWise(obj, net, index) ;
p = net.getParamIndex(net.layers(index).params{3}) ;
net.params(p).trainMethod = 'average' ;
net.params(p).learningRate = 0.1 ;
end
end
end
================================================
FILE: Layers/ErrorMultiClass.m
================================================
classdef ErrorMultiClass < dagnn.Loss
% author: Hakan Bilen
% computes multi-class accuracy
% inputs{1}->scores
% inputs{2}->gt labels
properties
nImgPerClass = []
nCorPred = []
accuracy = []
resetLayer = false
end
methods
function outputs = forward(obj, inputs, params)
if numel(inputs)~=2
error('wrong number of inputs');
end
nCls = size(inputs{1},3);
if obj.resetLayer || isempty(obj.nImgPerClass)
obj.nImgPerClass = zeros(1,size(inputs{1},3));
obj.nCorPred = zeros(1,size(inputs{1},3));
obj.accuracy = zeros(1,size(inputs{1},3));
if obj.resetLayer
obj.resetLayer = false ;
obj.average = 0 ;
end
end
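% predicted class for each sample: argmax over the class-score dimension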
[~,predictions] = max(gather(squeeze(inputs{1})),[],1);
for c=1:nCls
obj.nImgPerClass(c) = obj.nImgPerClass(c) + sum(inputs{2}==c);
obj.nCorPred(c) = obj.nCorPred(c) + sum(predictions==c & inputs{2}==c);
end
ni = obj.nImgPerClass;
ni(ni==0) = 1;
obj.accuracy = obj.nCorPred ./ ni;
obj.average = (1-mean(obj.accuracy));
outputs{1} = obj.average;
end
function [derInputs, derParams] = backward(obj, inputs, params, derOutputs)
derInputs = cell(1,2);
derParams = {} ;
end
function reset(obj)
obj.resetLayer = true ;
% obj.nImgPerClass = [];
% obj.nCorPred = [];
% obj.accuracy = [];
% obj.average = 0;
end
function obj = ErrorMultiClass(varargin)
obj.load(varargin) ;
obj.loss = 'error_multi_class' ;
end
end
end
================================================
FILE: Layers/L2Normalize.m
================================================
classdef L2Normalize < dagnn.ElementWise
% author: Hakan Bilen
% dagnn wrapper for l2 normalization
properties
scale = 1;
clip = [-inf inf];
offset = 0;
end
methods
function outputs = forward(obj, inputs, params)
outputs{1} = vl_nnl2norm(inputs{1},[obj.scale obj.clip obj.offset]);
end
function [derInputs, derParams] = backward(obj, inputs, params, derOutputs)
derInputs{1} = vl_nnl2norm(inputs{1},[obj.scale obj.clip obj.offset],derOutputs{1});
derParams = {} ;
end
function obj = L2Normalize(varargin)
obj.load(varargin) ;
end
end
end
================================================
FILE: Layers/LossNormalized.m
================================================
classdef LossNormalized < dagnn.Loss
% properties
% loss = 'softmaxlog'
% ignoreAverage = false
% opts = {}
% end
% properties (Transient)
% average = 0
% numAveraged = 0
% end
methods
function outputs = forward(obj, inputs, params)
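% vl_nnloss sums the loss over the batch; after updating the running
% average, divide by the batch size (4th dimension) so that the
% objective is a per-sample average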
outputs{1} = vl_nnloss(inputs{1}, inputs{2}, [], 'loss', obj.loss, obj.opts{:}) ;
obj.accumulateAverage(inputs, outputs);
if numel(size(inputs{1}))>3
bs = size(inputs{1},4) ;
else
bs = 1 ;
end
outputs{1} = outputs{1} / bs ;
end
function accumulateAverage(obj, inputs, outputs)
if obj.ignoreAverage, return; end;
n = obj.numAveraged ;
m = n + size(inputs{1}, 1) * size(inputs{1}, 2) * size(inputs{1}, 4);
obj.average = bsxfun(@plus, n * obj.average, gather(outputs{1})) / m ;
obj.numAveraged = m ;
end
function [derInputs, derParams] = backward(obj, inputs, params, derOutputs)
if numel(size(inputs{1}))>3
bs = size(inputs{1},4) ;
else
bs = 1 ;
end
derInputs{1} = vl_nnloss(inputs{1}, inputs{2}, derOutputs{1}, 'loss', obj.loss, obj.opts{:}) / bs;
derInputs{2} = [] ;
derParams = {} ;
end
function reset(obj)
obj.average = 0 ;
obj.numAveraged = 0 ;
end
function outputSizes = getOutputSizes(obj, inputSizes, paramSizes)
outputSizes{1} = [1 1 1 inputSizes{1}(4)] ;
end
function rfs = getReceptiveFields(obj)
% the receptive field depends on the dimension of the variables
% which is not known until the network is run
rfs(1,1).size = [NaN NaN] ;
rfs(1,1).stride = [NaN NaN] ;
rfs(1,1).offset = [NaN NaN] ;
rfs(2,1) = rfs(1,1) ;
end
function obj = LossNormalized(varargin)
obj.load(varargin) ;
end
end
end
================================================
FILE: Layers/TemporalPooling.m
================================================
classdef TemporalPooling < dagnn.ElementWise
% author: Hakan Bilen
% dagnn wrapper for temporal (max/avg) pooling
properties
method = 'max';
end
methods
function outputs = forward(obj, inputs, params)
outputs{1} = vl_nnpooltemporal(inputs{1},inputs{2},obj.method);
end
function [derInputs, derParams] = backward(obj, inputs, params, derOutputs)
derInputs = cell(1,2);
derInputs{1} = vl_nnpooltemporal(inputs{1},inputs{2},obj.method,derOutputs{1});
derParams = {} ;
end
function obj = TemporalPooling(varargin)
obj.load(varargin) ;
end
end
end
================================================
FILE: Layers/vl_nnarpooltemporal.m
================================================
function Y = vl_nnarpooltemporal(X,ids,dzdy)
% author: Hakan Bilen
% approximate rank pooling
% ids indicates frame-video association (integer video indices in 1..nVideos)
sz = size(X);
forward = logical(nargin<3);
if numel(ids)~=size(X,4)
error('Error: ids dimension does not match with X!');
end
nVideos = max(ids);
if forward
Y = zeros([sz(1:3),nVideos],'like',X);
else
Y = zeros(size(X),'like',X);
end
for v=1:nVideos
% pool among frames
indv = find(ids==v);
if isempty(indv)
error('Error: No frames in video %d',v);
end
N = numel(indv);
% approximate rank pooling frame coefficients:
% fw(i) = sum_{t=i}^{N} (2t - N - 1) / t
fw = zeros(1,N);
if N==1
fw = 1;
else
for i=1:N
fw(i) = sum((2*(i:N)-N-1) ./ (i:N));
end
end
if forward
Y(:,:,:,v) = sum(bsxfun(@times,X(:,:,:,indv),...
reshape(single(fw),[1 1 1 numel(indv)])),4);
else
Y(:,:,:,indv) = (bsxfun(@times,repmat(dzdy(:,:,:,v),[1,1,1,numel(indv)]),...
reshape(fw,[1 1 1 numel(indv)]))) ;
end
end
%
% if forward
% fprintf(' fwd-arpool %.2f ',sqrt(sum(Y(:).^2)));
% else
% fprintf(' back-arpool %f ',sqrt(sum(Y(:).^2)));
% end
================================================
FILE: Layers/vl_nnl2norm.m
================================================
function y = vl_nnl2norm(x,param,dzdy)
% author: Hakan Bilen
% l2 normalize whole feature map
sc = param(1);
clip = param(2:3);
offset = param(4);
if nargin == 3
assert(all(size(x) == size(dzdy)));
else
dzdy = [];
end
x_sz = size(x);
if ~all(x_sz([1 2]) == 1)
% Flatten each sample into a column: #features x #samples
x = reshape(x, prod(x_sz(1:3)), []);
end
x = x + offset;
if isempty(dzdy)
y = (bsxfun(@times, x, sc./(sqrt(sum(x .* x)) + single(1e-12))));
% clip max values
if all(y(:)<clip(1) | y(:)>clip(2))
warning('Too small clipping interval');
fprintf('min %f max %f\n',min(y(:)),max(y(:)));
end
y(y(:)<clip(1)) = clip(1);
y(y(:)>clip(2)) = clip(2);
else
if ~all(x_sz([1 2]) == 1)
dzdy = reshape(dzdy, prod(x_sz(1:3)), []);
end
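% gradient of y = sc * x / ||x|| applied to dzdy (per column/sample):
% dzdx = sc * ( dzdy / ||x|| - x * (x' * dzdy) / ||x||^3 )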
len_ = 1./sqrt(sum(x.*x)+single(1e-12));
dzdy_ = bsxfun(@times,dzdy,len_.^3);
y = sc * (bsxfun(@times,dzdy,len_)-bsxfun(@times,x,sum(x.*dzdy_)));
end
if ~all(x_sz([1 2]) == 1)
y = reshape(y, x_sz);
end
%
% if isempty(dzdy)
% fprintf(' fwd-l2 %.2f ',sqrt(sum(y(:).^2)));
% else
% fprintf(' back-l2 %f dzdy %f ',sqrt(sum(y(:).^2)),sqrt(sum(dzdy(:).^2)));
% end
================================================
FILE: Layers/vl_nnpooltemporal.m
================================================
function Y = vl_nnpooltemporal(X,ids,method,dzdy)
% author: Hakan Bilen
% temporal pooling along frames
% ids indicates frame-video association
% method 'max' or 'avg'
sz = size(X);
forward = logical(nargin<4);
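% move the frame (4th) dimension to the first spatial dimension so that
% vl_nnpool can pool over time with an [N,1] window per video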
Xp = permute(X,[4,1,2,3]);
if numel(ids)~=size(X,4)
error('Error: ids dimension does not match with X!');
end
nVideos = max(ids);
if forward
Yp = zeros([nVideos,sz(1:3)],'like',X);
for v=1:nVideos
% pool among frames
indv = find(ids==v);
Yp(v,:,:,:) = vl_nnpool(Xp(indv,:,:,:), [numel(indv),1], ...
'pad', 0, 'stride', [numel(indv),1], 'method', method) ;
end
else
dzdyp = permute(dzdy,[4,1,2,3]);
Yp = zeros(size(Xp),'like',Xp);
for v=1:nVideos
% pool among frames
indv = find(ids==v);
Yp(indv,:,:,:) = vl_nnpool(Xp(indv,:,:,:), [numel(indv),1], dzdyp(v,:,:,:), ...
'pad', 0, 'stride', [numel(indv),1], 'method', method) ;
end
end
% permute back
Y = permute(Yp,[2,3,4,1]);
% if forward
% fprintf(' fwd-ptemp %.2f ',sqrt(sum(Y(:).^2)));
% else
% fprintf(' back-ptemp %.2f ',sqrt(sum(Y(:).^2)));
% end
================================================
FILE: README.md
================================================
# Dynamic Image Networks for Action Recognition
## Improved Results (see the extended version of the CVPR paper)
ResNeXt-50 | HMDB51 (%) | UCF101 (%) |
------------------|--------|--------|
SI | 53.5 | 87.6 |
DI | 57.3 | 86.6 |
OF | 55.8 | 84.9 |
DOF | 58.9 | 86.6 |
SI+OF | 67.5 | 93.9 |
SI+DI | 61.3 | 90.6 |
OF+DOF | 62.6 | 89.1 |
SI+DI+OF+DOF | 71.5 | 95.0 |
SI+DI+OF+DOF+iDT | 74.2 | 95.4 |
* Results are reported as standard average multi-class accuracy (%)
* SI: static RGB image
* DI: dynamic RGB image
* OF: optical flow
* DOF: dynamic optical flow
* iDT: improved trajectory features
## Installation
1. Clone the Dynamic Image Net repository:
```Shell
git clone --recursive https://github.com/hbilen/dynamic-image-nets
```
2. Compile matconvnet toolbox: (see [http://www.vlfeat.org/matconvnet/install/](http://www.vlfeat.org/matconvnet/install/))
3. Install additional matconvnet packages
```matlab
run matconvnet/matlab/vl_setupnn.m ;
vl_contrib install mcnExtraLayers ; vl_contrib setup mcnExtraLayers ;
vl_contrib install autonn ; vl_contrib setup autonn ;
```
4. Download your dataset : (e.g. UCF101 from [http://crcv.ucf.edu/data/UCF101.php](http://crcv.ucf.edu/data/UCF101.php))
5. Convert videos to frames, resize them to 256x256 and store them in the directory structure shown below. Alternatively, you can download RGB and precomputed optical flow frames from [Christoph Feichtenhofer](http://ftp.tugraz.at/pub/feichtenhofer/tsfusion/data/) and copy the RGB frames under "UCF101/frames" and the optical flow frames under "UCF101/tvl1_flow".
```Shell
data/UCF101/ucfTrainTestlist/
├── classInd.txt
├── testlist01.txt
├── testlist02.txt
├── testlist03.txt
├── trainlist01.txt
├── trainlist02.txt
└── trainlist03.txt
data/UCF101/frames/
├── v_ApplyEyeMakeup_g01_c01
│ ├── 00001.jpg
│ ├── 00002.jpg
│ ├── 00003.jpg
│ ├── 00004.jpg
│ ├── 00005.jpg
```
## Compute and Visualise Approximate Dynamic Images
1. If you want to compute approximate dynamic images, get a list of ordered frames from a video and try
```matlab
di = compute_approximate_dynamic_images(images) ;
```
2. If you want to visualise approximate dynamic images, get a list of ordered frames from a video and try
```matlab
visualize_approximate_dynamic_images(images)
```
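Both functions expect `images` to be an ordered list of frames from a single video. A minimal sketch for assembling such a list (the video directory below is a placeholder; check `compute_approximate_dynamic_images.m` for the exact input format it expects):
```matlab
% list the frames of one video in temporal order (hypothetical path)
frameDir = fullfile('data', 'UCF101', 'frames', 'v_ApplyEyeMakeup_g01_c01') ;
frames = dir(fullfile(frameDir, '*.jpg')) ;
names = sort({frames.name}) ;                   % ensure temporal order
images = strcat([frameDir filesep], names) ;    % full path per frame
di = compute_approximate_dynamic_images(images) ;
```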
## Train a Dynamic Image Net
You can modify the options in `main_train.m` and train your model by running
```matlab
main_train
```
Note: To train a model on a dataset other than UCF101 or HMDB51, you need to write a custom `cnn_dataset_setup_data` script to build your database (imdb), as sketched below.
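For reference, the scripts in `Datasets/` build an imdb with the fields below, and the training code relies on this layout. A toy sketch with two videos and two classes ('MyDataset' and the file names are placeholders), following `cnn_ucf101_setup_data.m`:
```matlab
imdb.classes.name = {'classA', 'classB'} ;          % 1 x nClasses class names
imdb.imageDir     = fullfile('data', 'MyDataset', 'frames') ; % frame root dir
imdb.images.id    = 1:2 ;                           % one id per video
imdb.images.name  = {'vid001', 'vid002'} ;          % one frame folder per video
imdb.images.names = {strcat('vid001/', {'00001.jpg', '00002.jpg'}), ...
                     strcat('vid002/', {'00001.jpg', '00002.jpg'})} ; % per-video frame lists
imdb.images.label = [1, 2] ;                        % indices into classes.name
imdb.images.sets  = repmat([1, 3], 3, 1) ;          % 3 splits x nVideos: 1=train, 3=test
```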
## Evaluation
1. Download the CNN models for the UCF101 dataset that are used in the journal paper from [here](http://groups.inf.ed.ac.uk/hbilen-data/data/resnext50_dicnn.tar).
2. Choose the appropriate model, split and input type, e.g.:
```matlab
net = load('resnext50-rgb-arpool-split1.mat') ;
net = dagnn.DagNN.loadobj(net) ;
net.addLayer('errMC',ErrorMultiClass(),{'prediction','label'},'mcerr') ;
opts.network = net ;
opts.split = 1 ;
opts.train.gpus = 1 ;
opts.epochFactor = 0 ;
[net, info] = cnn_dicnn_rgb(opts)
```
## Citing Dynamic Image Networks
If you find the code useful, please cite:
```
@inproceedings{Bilen2016a,
  author    = "Bilen, H. and Fernando, B. and Gavves, E. and Vedaldi, A. and Gould, S.",
  title     = "Dynamic Image Networks for Action Recognition",
  booktitle = "CVPR",
  year      = "2016"
}
@article{Bilen2017a,
  author  = "Bilen, H. and Fernando, B. and Gavves, E. and Vedaldi, A.",
  title   = "Action Recognition with Dynamic Image Networks",
  journal = "IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)",
  year    = "2017"
}
```
## License
The analysis work performed with the program(s) must be non-proprietary work. Licensee and its contract users must be or be affiliated with an academic facility. Licensee may additionally permit individuals who are students at such academic facility to access and use the program(s). Such students will be considered contract users of licensee. The program(s) may not be used for commercial competitive analysis (such as benchmarking) or for any commercial activity, including consulting.
================================================
FILE: dicnn/cnn_dicnn_of.m
================================================
function [net, info] = cnn_dicnn_of(varargin)
%CNN_DICNN_OF Fine-tunes a pre-trained CNN with dynamic optical flow
% frames (DOF in the PAMI journal) on the UCF101 dataset
run(fullfile(fileparts(mfilename('fullpath')), ...
'..', 'matconvnet', 'matlab', 'vl_setupnn.m')) ;
run(fullfile(fileparts(mfilename('fullpath')), ...
'..', 'matconvnet', 'contrib', 'mcnExtraLayers', 'setup_mcnExtraLayers.m')) ;
run(fullfile(fileparts(mfilename('fullpath')), ...
'..', 'matconvnet', 'contrib', 'autonn', 'setup_autonn.m')) ;
addpath Layers Datasets
opts.dataDir = fullfile('data','UCF101') ;
opts.expDir = fullfile('exp', 'UCF101') ;
opts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat') ;
[opts, varargin] = vl_argparse(opts, varargin) ;
opts.numFetchThreads = 8 ;
opts.lite = false ;
opts.imdbPath = fullfile(opts.dataDir, 'imdb-of.mat');
opts.pool1Layer = 'conv0'; % insert rank pooling before conv1
opts.pool1Type = 'arpool';
opts.pool2Layer = 'fc6'; % temporal max pooling goes before this layer
opts.DropOutRate = 0.85 ;
opts.datasetFn = @cnn_ucf101_of_setup_data ;
opts.networkFn = @cnn_init_resnext ;
opts.network = [] ;
opts.split = 1; % data split
opts.reverseDyn = 0; % reverse video frames e.g.[N:-1:1]
opts.numDynImgs = 10 ;
opts.epochFactor = 5 ;
opts.train = struct() ;
opts.train.gpus = [];
opts.train.batchSize = 128 ;
opts.train.numSubBatches = 32 ;
opts.train.solver = [] ;
opts.train.prefetch = true ;
opts.train.learningRate = 1e-2 ;
opts.train.numEpochs = 30 ;
% opts.train.savePreds = true ;
opts.train.randomSeed = 0 ;
opts = vl_argparse(opts, varargin) ;
if ~isfield(opts.train, 'gpus'), opts.train.gpus = []; end;
% -------------------------------------------------------------------------
% Prepare data
% -------------------------------------------------------------------------
if exist(opts.imdbPath,'file')
imdb = load(opts.imdbPath) ;
else
imdb = opts.datasetFn('dataDir', opts.dataDir, 'lite', opts.lite) ;
mkdir(opts.expDir) ;
save(opts.imdbPath, '-struct', 'imdb') ;
end
% UCF101 has 3 data splits
if opts.split>3
error('split should be <=3');
end
imdb.images.set = imdb.images.sets(opts.split,:);
% reverse frame order
if opts.reverseDyn
for i=1:numel(imdb.images.names)
imdb.images.names{i} = imdb.images.names{i}(end:-1:1);
end
end
% -------------------------------------------------------------------------
% Prepare model
% -------------------------------------------------------------------------
if isempty(opts.network)
net = load(opts.modelPath);
if isfield(net,'net')
net = net.net;
end
opts.nCls = max(imdb.images.label) ;
% net = dagnn.DagNN.loadobj(net) ;
net = opts.networkFn(net,opts) ;
% two channels instead of 3 RGB
net.params(1).value = net.params(1).value(:,:,1:2,:) ;
% Set the class names in the network
net.meta.classes.name = imdb.classes.name ;
net.meta.classes.description = imdb.classes.name ;
else
assert(isa(opts.network,'dagnn.DagNN')) ;
net = opts.network ;
end
% -------------------------------------------------------------------------
% Learn
% -------------------------------------------------------------------------
if opts.epochFactor>0
opts.train.train = repmat(find(imdb.images.set==1),[1 opts.epochFactor]) ;
else
opts.train.train = NaN ;
opts.train.numEpochs = 1 ;
end
opts.train.val = find(imdb.images.set==3) ;
[net, info] = cnn_train_dicnn_dag(net, imdb, getBatchFn(opts, net.meta), ...
'expDir', opts.expDir, ...
opts.train) ;
% -------------------------------------------------------------------------
% Report accuracy
% -------------------------------------------------------------------------
errlayer = net.getLayerIndex('errMC') ;
if ~isnan(errlayer)
cats = imdb.classes.name ;
accs = net.layers(errlayer).block.accuracy ;
if numel(cats)~=numel(accs)
error('wrong number of classes\n') ;
end
for i=1:numel(cats)
fprintf('%s acc %.1f\n',cats{i},100*accs(i)) ;
end
fprintf('Mean accuracy %.1f\n',100*mean(accs)) ;
end
% -------------------------------------------------------------------------
function fn = getBatchFn(opts, meta)
% -------------------------------------------------------------------------
useGpu = numel(opts.train.gpus) > 0 ;
bopts.numThreads = opts.numFetchThreads ;
bopts.imageSize = meta.normalization.imageSize ;
if isfield(meta.normalization,'border')
bopts.border = meta.normalization.border ;
else
bopts.border = meta.normalization.imageSize(1:2) ./ ...
meta.normalization.cropSize - meta.normalization.imageSize(1:2);
end
bopts.averageImage = 128 * ones([1 1 2],'single') ;
bopts.numDynImgs = opts.numDynImgs ;
fn = @(x,y) getDagNNBatch(bopts,useGpu,x,y) ;
% -------------------------------------------------------------------------
function inputs = getDagNNBatch(opts, useGpu, imdb, batch)
% -------------------------------------------------------------------------
% batch indexes videos (not individual frames)
if isempty(batch)
inputs = {'input', [], 'label', [], 'VideoId1', [], 'VideoId2', []};
return;
end
isVal = ~isempty(batch) && imdb.images.set(batch(1)) ~= 1 ;
if ~isVal, transformation='multiScaleRegular'; else transformation='none';end
names = imdb.images.names(batch);
% images = strcat([imdb.imageDir filesep], imdb.images.name(batch)) ;
namesM = {};
nVids = numel(batch);
VideoId1 = [];
VideoId2 = [];
% step-size
stepSize = 6;
% pool nFrames into a dynamic image
nFrames = 10;
% number of dynamic images to be max pooled later
nDynImgs = opts.numDynImgs ;
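% i.e. each video is cut into windows of nFrames frames taken every
% stepSize frames; each window becomes one dynamic image (grouped by
% VideoId1) and the dynamic images of a video are later max-pooled
% together (grouped by VideoId2)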
opts = rmfield(opts,'numDynImgs') ;
c1 = 1;
for v=1:nVids
name = names{v};
nFrms = numel(name)/2;
nSample = nFrames;
if isVal
startF = 1 ;
else
startF = ceil(stepSize/2) ;
end
nr = numel(startF:stepSize:nFrms);
% jitter by randomly dropping ~50% of the windows and cap at nDynImgs dynamic images per video
if nr > 1 && (~isVal && nr>nDynImgs)
rat = min(nDynImgs,ceil(0.50*nr));
ri = randperm(nr);
ri = ri(1:rat);
r = zeros(1,nr);
r(ri) = 1;
else
r = ones(1,nr);
end
c3 = 1;
c2 = 0;
for f=startF:stepSize:nFrms
if r(c3)
idx = f:min(f+nSample-1,nFrms) ;
if numel(idx)<nFrames
idx = [idx idx(end) * ones(1,nFrames-numel(idx))];
end
idxu = 2*idx - 1;
idxv = 2*idx;
idxuv = zeros(1,2 * numel(idxu)) ;
idxuv(1:2:end) = idxu ;
idxuv(2:2:end) = idxv ;
namesM{end+1} = name(idxuv);
VideoId1 = [VideoId1 c1 * ones(1,numel(idx))];
c1 = c1 + 1;
c2 = c2 + 1;
end
c3 = c3 + 1;
end
VideoId2 = [VideoId2 v * ones(1,c2) ] ;
end
images = strcat([imdb.imageDir filesep], horzcat(namesM{:}) ) ;
im = cnn_video_of_get_batch(images, VideoId1, opts, ...
'transformation', transformation, 'prefetch', nargout == 0) ;
if nargout > 0
if useGpu
im = gpuArray(im) ;
end
inputs = {'input', im, 'label', imdb.images.label(batch), ...
'VideoId1', VideoId1, 'VideoId2', VideoId2};
end
================================================
FILE: dicnn/cnn_dicnn_rgb.m
================================================
function [net, info] = cnn_dicnn_rgb(varargin)
%CNN_DICNN_RGB Fine-tunes a pre-trained CNN with dynamic images on RGB
% frames (DI in the PAMI journal) on the UCF101 dataset
run(fullfile(fileparts(mfilename('fullpath')), ...
'..', 'matconvnet', 'matlab', 'vl_setupnn.m')) ;
run(fullfile(fileparts(mfilename('fullpath')), ...
'..', 'matconvnet', 'contrib', 'mcnExtraLayers', 'setup_mcnExtraLayers.m')) ;
run(fullfile(fileparts(mfilename('fullpath')), ...
'..', 'matconvnet', 'contrib', 'autonn', 'setup_autonn.m')) ;
addpath Layers Datasets
opts.dataDir = fullfile('data','UCF101') ;
opts.expDir = fullfile('exp', 'UCF101') ;
opts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat');
opts.datasetFn = @cnn_ucf101_setup_data ;
opts.networkFn = @cnn_init_resnext ;
opts.network = [] ;
[opts, varargin] = vl_argparse(opts, varargin) ;
opts.numFetchThreads = 8 ;
opts.lite = false ;
opts.imdbPath = fullfile(opts.dataDir, 'imdb-rgb.mat');
opts.pool1Layer = 'conv0'; % insert rank pooling before conv1
opts.pool1Type = 'arpool';
opts.pool2Layer = 'pool5';
opts.pool2Type = 'maxpool';
opts.DropOutRate = 0.5 ;
opts.epochFactor = 5 ;
opts.split = 1; % data split
opts.reverseDyn = 0; % reverse video frames e.g.[N:-1:1]
opts.train = struct() ;
opts.train.gpus = [];
opts.train.batchSize = 128 ;
opts.train.numSubBatches = 16 ;
opts.train.solver = [] ;
opts.train.prefetch = true ;
opts.train.numEpochs = 30 ;
opts.train.randomSeed = 0 ;
% resnet50
% opts.train.learningRate = 1e-2 * [ones(1,2), 0.1*ones(1,1), 0.01*ones(1,1)];
% caffe-ref
opts.train.learningRate = 1e-3 * [ones(1,2), 0.1*ones(1,1), 0.01*ones(1,1)];
opts = vl_argparse(opts, varargin) ;
if ~isfield(opts.train, 'gpus'), opts.train.gpus = []; end
% opts.train.numEpochs = numel(opts.train.learningRate);
% -------------------------------------------------------------------------
% Prepare data
% -------------------------------------------------------------------------
if exist(opts.imdbPath,'file')
imdb = load(opts.imdbPath) ;
else
imdb = opts.datasetFn('dataDir', opts.dataDir, 'lite', opts.lite) ;
mkdir(opts.expDir) ;
save(opts.imdbPath, '-struct', 'imdb') ;
end
% UCF101 has 3 data splits
if opts.split>3
error('split should be <=3');
end
imdb.images.set = imdb.images.sets(opts.split,:);
% reverse frame order
if opts.reverseDyn
for i=1:numel(imdb.images.names)
imdb.images.names{i} = imdb.images.names{i}(end:-1:1);
end
end
% -------------------------------------------------------------------------
% Prepare model
% -------------------------------------------------------------------------
if isempty(opts.network)
net = load(opts.modelPath);
if isfield(net,'net')
net = net.net;
end
opts.nCls = max(imdb.images.label) ;
net = opts.networkFn(net,opts);
if numel(net.meta.normalization.averageImage)>3
sz = size(net.meta.normalization.averageImage) ;
net.meta.normalization.averageImage = ...
mean(reshape(net.meta.normalization.averageImage,[sz(1)*sz(2) sz(3)]),1) ;
end
% Set the class names in the network
net.meta.classes.name = imdb.classes.name ;
net.meta.classes.description = imdb.classes.name ;
else
assert(isa(opts.network,'dagnn.DagNN')) ;
net = opts.network ;
end
% -------------------------------------------------------------------------
% Learn
% -------------------------------------------------------------------------
if opts.epochFactor>0
opts.train.train = repmat(find(imdb.images.set==1),[1 opts.epochFactor]) ;
else
opts.train.train = NaN ;
opts.train.numEpochs = 1 ;
end
opts.train.val = find(imdb.images.set==3) ;
[net, info] = cnn_train_dicnn_dag(net, imdb, getBatchFn(opts, net.meta), ...
'expDir', opts.expDir, ...
opts.train) ;
% -------------------------------------------------------------------------
% Report accuracy
% -------------------------------------------------------------------------
errlayer = net.getLayerIndex('errMC') ;
if ~isnan(errlayer)
cats = imdb.classes.name ;
accs = net.layers(errlayer).block.accuracy ;
if numel(cats)~=numel(accs)
error('wrong number of classes\n') ;
end
for i=1:numel(cats)
fprintf('%s acc %.1f\n',cats{i},100*accs(i)) ;
end
fprintf('Mean accuracy %.1f\n',100*mean(accs)) ;
end
% -------------------------------------------------------------------------
function fn = getBatchFn(opts, meta)
% -------------------------------------------------------------------------
useGpu = numel(opts.train.gpus) > 0 ;
bopts.numThreads = opts.numFetchThreads ;
bopts.imageSize = meta.normalization.imageSize ;
if isfield(meta.normalization,'border')
bopts.border = meta.normalization.border ;
else
bopts.border = meta.normalization.imageSize(1:2) ./ ...
meta.normalization.cropSize - meta.normalization.imageSize(1:2);
end
% bopts.averageImage = [];
bopts.averageImage = meta.normalization.averageImage ;
bopts.interpolation = meta.normalization.interpolation ;
bopts.keepAspect = meta.normalization.keepAspect ;
% bopts.rgbVariance = meta.augmentation.rgbVariance ;
% bopts.transformation = meta.augmentation.transformation ;
fn = @(x,y) getDagNNBatch(bopts,useGpu,x,y) ;
% -------------------------------------------------------------------------
function inputs = getDagNNBatch(opts, useGpu, imdb, batch)
% -------------------------------------------------------------------------
% batch indexes videos (not individual frames)
if isempty(batch)
inputs = {'input', [], 'label', [], 'VideoId1', [], 'VideoId2', []};
return;
end
isVal = ~isempty(batch) && imdb.images.set(batch(1)) ~= 1 ;
% if ~isVal, transformation='stretch'; else transformation='none';end
if ~isVal, transformation='multiScaleRegular'; else transformation='none';end
names = imdb.images.names(batch);
% images = strcat([imdb.imageDir filesep], imdb.images.name(batch)) ;
namesM = {};
nVids = numel(batch);
VideoId1 = [];
VideoId2 = [];
% step-size
stepSize = 6;
% pool nFrames into a dynamic image
nFrames = 10;
% number of dynamic images to be max pooled later
nDynImgs = 10;
c1 = 1;
for v=1:nVids
name = names{v};
if isVal
startF = 1 ;
else
startF = ceil(stepSize/2) ;
end
nFrms = numel(name);
nSample = nFrames;
nr = numel(startF:stepSize:nFrms);
% jitter by randomly dropping ~50% of the windows and cap at nDynImgs dynamic images per video
if nr > 1 && (~isVal && nr>nDynImgs)
rat = min(nDynImgs,ceil(0.50*nr));
ri = randperm(nr);
ri = ri(1:rat);
r = zeros(1,nr);
r(ri) = 1;
else
if nr>2*nDynImgs
rat = 2*nDynImgs;
ri = randperm(nr);
ri = ri(1:rat);
r = zeros(1,nr);
r(ri) = 1;
else
r = ones(1,nr);
end
end
c3 = 1;
c2 = 0;
for f=startF:stepSize:nFrms
if r(c3)
idx = f:min(f+nSample-1,nFrms) ;
if numel(idx)<nFrames
idx = [idx idx(end) * ones(1,nFrames-numel(idx))];
end
namesM{end+1} = name(idx);
VideoId1 = [VideoId1 c1 * ones(1,numel(idx))];
c1 = c1 + 1;
c2 = c2 + 1;
end
c3 = c3 + 1;
end
VideoId2 = [VideoId2 v * ones(1,c2) ] ;
end
images = strcat([imdb.imageDir filesep], horzcat(namesM{:}) ) ;
im = cnn_video_rgb_get_batch(images, VideoId1, opts, ...
'transformation', transformation, 'prefetch', nargout == 0) ;
if nargout > 0
if useGpu
im = gpuArray(im) ;
end
inputs = {'input', im, 'label', imdb.images.label(batch), ...
'VideoId1', VideoId1, 'VideoId2', VideoId2};
end
================================================
FILE: dicnn/cnn_init_cafferef.m
================================================
% -------------------------------------------------------------------------
function net = cnn_init_cafferef(net,opts)
% -------------------------------------------------------------------------
drop6p = find(cellfun(@(a) strcmp(a.name, 'dropout6'), net.layers)==1);
drop7p = find(cellfun(@(a) strcmp(a.name, 'dropout7'), net.layers)==1);
if ~isempty(drop6p)
assert(~isempty(drop7p));
net.layers{drop6p}.rate = opts.DropOutRate;
net.layers{drop7p}.rate = opts.DropOutRate;
else
relu6p = find(cellfun(@(a) strcmp(a.name, 'relu6'), net.layers)==1);
relu7p = find(cellfun(@(a) strcmp(a.name, 'relu7'), net.layers)==1);
drop6 = struct('type','dropout','rate', opts.DropOutRate,'name','dropout6') ;
drop7 = struct('type','dropout','rate', opts.DropOutRate,'name','dropout7') ;
net.layers = [net.layers(1:relu6p) drop6 net.layers(relu6p+1:relu7p) drop7 net.layers(relu7p+1:end)];
end
% replace fc8
fc8l = cellfun(@(a) strcmp(a.name, 'fc8'), net.layers)==1;
nCls = opts.nCls ;
% nCls = 101;
sizeW = size(net.layers{fc8l}.weights{1});
if sizeW(4)~=nCls
net.layers{fc8l}.weights = {zeros(sizeW(1),sizeW(2),sizeW(3),nCls,'single'), ...
zeros(1, nCls, 'single')};
end
% change loss
% net.layers(end) = [];
net.layers{end} = struct('name','loss', 'type','softmaxloss') ;
% convert to dagnn
net = dagnn.DagNN.fromSimpleNN(net, 'canonicalNames', true) ;
poolLyr1 = find(arrayfun(@(a) strcmp(a.name, opts.pool1Layer), net.layers)==1);
assert(~isempty(poolLyr1));
% configure appr-rank-pool
switch opts.pool1Type
case 'arpool'
if strcmp(opts.pool1Layer,'conv1')
net.addLayer('arpool',AppRankPooling('scale',1),{net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImgN');
net.addLayer('l2normalize',L2Normalize('scale',6000,'clip',[-128 128]),...
'DynImgN','DynImg');
else
net.addLayer('arpool',AppRankPooling('scale',0.1),{net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImgN');
net.addLayer('reluP',dagnn.ReLU(),...
{'DynImgN'},'DynImg');
end
net.setLayerInputs(opts.pool1Layer,{'DynImg'}) ;
case 'ppool1'
if strcmp(opts.pool1Layer,'conv1')
net.addLayer('parampool',LinComb('pad',[1 1 10 1]),...
{net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImg',{'conv0f','conv0b'});
else
net.addLayer('parampool',LinComb('pad',[1 1 10 1]),...
{net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImgN',{'conv0f','conv0b'});
net.addLayer('reluP',dagnn.ReLU(),...
{'DynImgN'},'DynImg');
end
net.layers(poolLyr1).inputs{1} = 'DynImg' ;
% net.params(end-1).value = 0.01 * randn(1,1,10,1,'single');
net.params(end-1).value = 0.1 * ones(1,1,10,1,'single');
net.params(end).value = zeros(1,1,'single');
net.params(end-1).learningRate = 0.1 ;
net.params(end).learningRate = 0.2 ;
case 'ppool2'
if strcmp(opts.pool1Layer,'conv1')
net.addLayer('parampool',LinComb('pad',[1 1 10 1]),...
{net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImg',{'conv0f','conv0b'});
else
net.addLayer('parampool',LinComb('pad',[1 1 10 1]),...
{net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImgN',{'conv0f','conv0b'});
net.addLayer('reluP',dagnn.ReLU(),...
{'DynImgN'},'DynImg');
end
net.layers(poolLyr1).inputs{1} = 'DynImg' ;
% net.params(end-1).value = 0.01 * randn(1,1,10,1,'single');
net.params(end-1).value = 0.1 * ones(1,1,10,1,'single');
net.params(end).value = zeros(1,1,'single');
net.params(end-1).learningRate = 0.1 ;
net.params(end).learningRate = 0.2 ;
case 'none'
otherwise
error('Unknown pool type %s', opts.pool1Type) ;
end
% second pool layer (max pooling)
poolLyr2 = find(arrayfun(@(a) strcmp(a.name, opts.pool2Layer), net.layers)==1);
net.addLayer('tempPoolMax',TemporalPooling('method','max'),...
{net.layers(poolLyr2(1)).inputs{1},'VideoId2'},'tempPoolMax');
net.layers(poolLyr2).inputs{1} = 'tempPoolMax';
% add multi-class error
net.addLayer('errMC',ErrorMultiClass(),{'prediction','label'},'mcerr');
net_ = net.saveobj ;
net = dagnn.DagNN.loadobj(net_) ;
net.removeLayer('loss') ;
net.addLayer('loss', ...
LossNormalized('loss', 'softmaxlog') ,...
{'prediction', 'label'}, ...
'objective') ;
% replace standard matconvnet bnorm with my version
bns = find(arrayfun(@(a) strcmp(class(a.block), 'dagnn.BatchNorm'), net.layers)==1);
for i=1:numel(bns)
bb = net.layers(bns(i)).block ;
net.layers(bns(i)).block = BatchNormN('numChannels',bb.numChannels,...
'epsilon',bb.epsilon,...
'opts',bb.opts) ;
end
================================================
FILE: dicnn/cnn_init_resnext.m
================================================
% -------------------------------------------------------------------------
function net = cnn_init_resnext(net,opts)
% -------------------------------------------------------------------------
% initialize classifier
net = dagnn.DagNN.loadobj(net) ;
% convs = find(arrayfun(@(a) isa(a.block, 'dagnn.Conv'), net.layers)==1);
fclayer = net.getLayer('classifier_0') ;
sizeW = size(net.params(fclayer.paramIndexes(1)).value);
% opts.nCls = 101;
nCls = opts.nCls ;
DropOutRate = opts.DropOutRate ;
net.params(fclayer.paramIndexes(1)).value = ...
0.01 * randn([sizeW(1:3),nCls],'single') ;
net.params(fclayer.paramIndexes(2)).value = zeros(nCls,1,'single') ;
% change loss
softmax = find(arrayfun(@(a) isa(a.block, 'dagnn.SoftMax'), net.layers)==1);
if ~isempty(softmax)
net.removeLayer(net.layers(softmax(1)).name) ;
end
% convs = find(arrayfun(@(a) isa(a.block, 'dagnn.Conv'), net.layers)==1);
fclayer = find(arrayfun(@(a) strcmp(a.name, 'classifier_0'), net.layers)==1);
net.renameVar(net.layers(fclayer(end)).name,'prediction') ;
net.renameVar('data','input') ;
%------------------------------------------------------------------------%
% configure appr-rank-pool
switch opts.pool1Type
case 'arpool'
if strcmp(opts.pool1Layer,'conv0')
poolLyr1 = 1 ;
net.addLayer('arpool',AppRankPooling('scale',0.1),{'input','VideoId1'},'DynImg');
net.setLayerInputs(net.layers(1).name,{'DynImg'}) ;
else
poolLyr1 = find(arrayfun(@(a) strcmp(a.name, opts.pool1Layer), net.layers)==1);
assert(~isempty(poolLyr1));
net.addLayer('arpool',AppRankPooling('scale',0.1),{net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImg');
net.setLayerInputs(opts.pool1Layer,{'DynImg'}) ;
end
case 'ppool1'
if strcmp(opts.pool1Layer,'conv0')
poolLyr1 = 1 ;
else
poolLyr1 = find(arrayfun(@(a) strcmp(a.name, opts.pool1Layer), net.layers)==1);
end
net.addLayer('parampool',LinComb('pad',[1 1 10 1]),...
{'features_4_0_merge','VideoId1'},'DynImg0',{'conv0f','conv0b'});
% net.params(end-1).value = 0.1 * ones(1,1,10,1,'single');
net.params(end-1).value = 0.1 * randn(1,1,10,1,'single');
net.params(end).value = zeros(1,1,'single');
net.addLayer('BnormDyn',dagnn.BatchNorm('numChannels',256),'DynImg0','DynImg',...
{'dym','dyb','dybx'}) ;
net.params(end-2).value = ones(256,1,'single') ;
net.params(end-1).value = zeros(256,1,'single') ;
net.params(end).value = zeros(256,2,'single') ;
% net.addLayer('reluP',dagnn.ReLU(),...
% {'DynImg1'},'DynImg');
net.layers(16).inputs{1} = 'DynImg' ;
for i=numel(net.params)-4:numel(net.params),
net.params(i).learningRate = 0.1 * net.params(i).learningRate;
end
case 'none'
otherwise
error('Unknown pool type %s', opts.pool1Type) ;
end
net.rebuild() ;
%------------------------------------------------------------------------%
% second pool layer (max pooling)
% poolLyr2 = find(arrayfun(@(a) strcmp(a.name, 'pool5'), net.layers)==1);
poolLyr2 = find(arrayfun(@(a) strcmp(a.name, 'features_7_1_merge'), net.layers)==1);
net.addLayer('tempPoolMax',TemporalPooling('method','max'),...
{net.layers(poolLyr2(1)).outputs{1},'VideoId2'},'tempPoolMax');
% change the input of fc last layer
% net.setLayerInputs(net.layers(convs(end)).name,'tempPoolMax') ;
% net.addLayer('bnar',dagnn.BatchNorm('numChannels',2048),{'tempPoolMax'},...
% 'tempPoolMaxbn',{'bnar_m','bnar_b','bnar_x'});
poolLyr2next = find(arrayfun(@(a) strcmp(a.name, 'features_7_1_id_relu'), net.layers)==1);
net.setLayerInputs(net.layers(poolLyr2next(1)).name,{'tempPoolMax'}) ;
net.rebuild() ;
%------------------------------------------------------------------------%
% add drop-out layers
if DropOutRate>0
pool5 = find(arrayfun(@(a) strcmp(a.name, 'features_8'), net.layers)==1);
oo = net.layers(pool5(1)).outputs{1};
net.addLayer('drop_pool5',dagnn.DropOut('rate',DropOutRate),...
oo,sprintf('drop_%s',oo),{});
net.setLayerInputs('classifier_permute',{sprintf('drop_%s',oo)}) ;
end
%------------------------------------------------------------------------%
% add multi-class error
net.addLayer('errMC',ErrorMultiClass(),{'prediction','label'},'mcerr');
net.addLayer('loss', ...
LossNormalized('loss', 'softmaxlog') ,...
{'prediction', 'label'}, ...
'objective') ;
%------------------------------------------------------------------------%
net.rebuild()
% replace standard matconvnet bnorm with my version
bns = find(arrayfun(@(a) strcmp(class(a.block), 'dagnn.BatchNorm'), net.layers)==1);
for i=1:numel(bns)
bb = net.layers(bns(i)).block ;
net.layers(bns(i)).block = BatchNormN('numChannels',bb.numChannels,...
'epsilon',bb.epsilon,...
'opts',bb.opts) ;
end
% dagMergeBatchNorm(net) ;
% dagRemoveLayersOfType(net, 'dagnn.BatchNorm') ;
net_ = net.saveobj ;
net = dagnn.DagNN.loadobj(net_) ;
net.meta.normalization.border = [32 32] ;
================================================
FILE: dicnn/cnn_single_of.m
================================================
function [net, info] = cnn_single_of(varargin)
%CNN_SINGLE_OF Demonstrates fine-tuning a pre-trained CNN with static
% optical flow frames (OF in the PAMI journal) on the UCF101 dataset
run(fullfile(fileparts(mfilename('fullpath')), ...
'..', 'matconvnet', 'matlab', 'vl_setupnn.m')) ;
addpath Layers Datasets
opts.dataDir = fullfile('data','UCF101') ;
opts.expDir = fullfile('exp', 'UCF101') ;
opts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat') ;
[opts, varargin] = vl_argparse(opts, varargin) ;
opts.numFetchThreads = 8 ;
opts.lite = false ;
opts.imdbPath = fullfile(opts.dataDir, 'imdb-of.mat');
opts.DropOutRate = 0.85 ;
opts.datasetFn = @cnn_ucf101_of_setup_data ;
opts.networkFn = @cnn_init_resnext ;
opts.split = 1; % data split
opts.reverseDyn = 0; % reverse video frames e.g.[N:-1:1]
opts.numDynImgs = 10 ;
opts.epochFactor = 5 ;
opts.pool1Layer = 'conv0'; % before conv1
opts.pool1Type = 'none' ;
opts.pool2Layer = 'fc6' ;
opts.train = struct() ;
opts.train.gpus = [];
opts.train.batchSize = 128 ;
opts.train.numSubBatches = 32 ;
opts.train.solver = [] ;
opts.train.prefetch = true ;
opts.train.learningRate = 1e-2 ;
opts.train.numEpochs = 30 ;
opts = vl_argparse(opts, varargin) ;
if ~isfield(opts.train, 'gpus'), opts.train.gpus = []; end;
% -------------------------------------------------------------------------
% Prepare data
% -------------------------------------------------------------------------
if exist(opts.imdbPath,'file')
imdb = load(opts.imdbPath) ;
else
imdb = opts.datasetFn('dataDir', opts.dataDir, 'lite', opts.lite) ;
mkdir(opts.expDir) ;
save(opts.imdbPath, '-struct', 'imdb') ;
end
% UCF101 has 3 data splits
if opts.split>3
error('split should be <=3');
end
imdb.images.set = imdb.images.sets(opts.split,:);
% reverse frame order
if opts.reverseDyn
for i=1:numel(imdb.images.names)
imdb.images.names{i} = imdb.images.names{i}(end:-1:1);
end
end
% -------------------------------------------------------------------------
% Prepare model
% -------------------------------------------------------------------------
net = load(opts.modelPath);
if isfield(net,'net')
net = net.net;
end
opts.nCls = max(imdb.images.label) ;
% net = dagnn.DagNN.loadobj(net) ;
net = opts.networkFn(net,opts) ;
% two channels instead of 3 RGB
net.params(1).value = net.params(1).value(:,:,1:2,:) ;
% Set the class names in the network
net.meta.classes.name = imdb.classes.name ;
net.meta.classes.description = imdb.classes.name ;
% -------------------------------------------------------------------------
% Learn
% -------------------------------------------------------------------------
if opts.epochFactor>0
opts.train.train = repmat(find(imdb.images.set==1),[1 opts.epochFactor]) ;
else
opts.train.train = NaN ;
end
opts.train.val = find(imdb.images.set==3) ;
[net, info] = cnn_train_dicnn_dag(net, imdb, getBatchFn(opts, net.meta), ...
'expDir', opts.expDir, ...
opts.train) ;
% -------------------------------------------------------------------------
% Report accuracy
% -------------------------------------------------------------------------
errlayer = net.getLayerIndex('errMC') ;
if ~isnan(errlayer)
cats = imdb.classes.name ;
accs = net.layers(errlayer).block.accuracy ;
if numel(cats)~=numel(accs)
error('wrong number of classes\n') ;
end
for i=1:numel(cats)
fprintf('%s acc %.1f\n',cats{i},100*accs(i)) ;
end
fprintf('Mean accuracy %.1f\n',100*mean(accs)) ;
end
% -------------------------------------------------------------------------
function fn = getBatchFn(opts, meta)
% -------------------------------------------------------------------------
useGpu = numel(opts.train.gpus) > 0 ;
bopts.numThreads = opts.numFetchThreads ;
bopts.imageSize = meta.normalization.imageSize ;
if isfield(meta.normalization,'border')
bopts.border = meta.normalization.border ;
else
bopts.border = meta.normalization.imageSize(1:2) ./ ...
meta.normalization.cropSize - meta.normalization.imageSize(1:2);
end
bopts.averageImage = 128 * ones([1 1 2],'single') ;
bopts.numDynImgs = opts.numDynImgs ;
% bopts.averageImage = meta.normalization.averageImage ;
% bopts.rgbVariance = meta.augmentation.rgbVariance ;
% bopts.transformation = meta.augmentation.transformation ;
% bopts.transformation = 'stretch' ;
bopts.transformation = 'multiScaleRegular' ;
fn = @(x,y) getDagNNBatch(bopts,useGpu,x,y) ;
% -------------------------------------------------------------------------
function inputs = getDagNNBatch(opts, useGpu, imdb, batch)
% -------------------------------------------------------------------------
% batch indexes videos (not individual frames)
if isempty(batch)
inputs = {'input', [], 'label', [], 'VideoId1', [], 'VideoId2', []};
return;
end
isVal = ~isempty(batch) && imdb.images.set(batch(1)) ~= 1 ;
if ~isVal, transformation='multiScaleRegular'; else transformation='none';end
names = imdb.images.names(batch);
% images = strcat([imdb.imageDir filesep], imdb.images.name(batch)) ;
namesM = {};
nVids = numel(batch);
VideoId1 = [];
VideoId2 = [];
% step-size
stepSize = 6;
% pool nFrames into a dynamic image
nFrames = 1;
% number of dynamic images to be max pooled later
nDynImgs = opts.numDynImgs ;
opts = rmfield(opts,'numDynImgs') ;
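% e.g. a video with 100 u/v frame pairs yields numel(1:6:100) = 17 candidate
% windows; during training about half of them are kept (capped at nDynImgs),
% and each kept window contributes one interleaved u,v,u,v,... filename run.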
c1 = 1;
for v=1:nVids
name = names{v};
nFrms = numel(name)/2;
nSample = nFrames;
nr = numel(1:stepSize:nFrms);
% jitter by randomly dropping about 50% of the candidate windows and cap the video at nDynImgs dynamic images
if nr > 1 && (~isVal && nr>nDynImgs)
rat = min(nDynImgs,ceil(0.50*nr));
ri = randperm(nr);
ri = ri(1:rat);
r = zeros(1,nr);
r(ri) = 1;
else
r = ones(1,nr);
end
c3 = 1;
c2 = 0;
for f=1:stepSize:nFrms
if r(c3)
idx = f:min(f+nSample-1,nFrms) ;
if numel(idx)<nFrames
idx = [idx idx(end) * ones(1,nFrames-numel(idx))];
end
idxu = 2*idx - 1;
idxv = 2*idx;
idxuv = zeros(1,2 * numel(idxu)) ;
idxuv(1:2:end) = idxu ;
idxuv(2:2:end) = idxv ;
namesM{end+1} = name(idxuv);
VideoId1 = [VideoId1 c1 * ones(1,numel(idx))];
c1 = c1 + 1;
c2 = c2 + 1;
end
c3 = c3 + 1;
end
VideoId2 = [VideoId2 v * ones(1,c2) ] ;
end
images = strcat([imdb.imageDir filesep], horzcat(namesM{:}) ) ;
im = cnn_video_of_get_batch(images, VideoId1, opts, ...
'transformation', transformation, 'prefetch', nargout == 0, ...
'subMean', false) ;
if nargout > 0
if useGpu
im = gpuArray(im) ;
end
inputs = {'input', im, 'label', imdb.images.label(batch), ...
'VideoId2', VideoId2};
end
================================================
FILE: dicnn/cnn_single_rgb.m
================================================
function [net, info] = cnn_single_rgb(varargin)
%CNN_SINGLE_RGB Demonstrates fine-tuning a pre-trained CNN with static
% RGB frames (SI in pami journal) on UCF101 dataset
run(fullfile(fileparts(mfilename('fullpath')), ...
'..', 'matconvnet', 'matlab', 'vl_setupnn.m')) ;
addpath Layers Datasets
opts.dataDir = fullfile('data','UCF101') ;
opts.expDir = fullfile('exp', 'UCF101') ;
opts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat');
opts.datasetFn = @cnn_ucf101_setup_data ;
opts.networkFn = @cnn_init_resnext ;
opts.pool1Type = 'none' ;
opts.pool1Layer = 'conv1' ;
opts.pool2Layer = '' ;
[opts, varargin] = vl_argparse(opts, varargin) ;
opts.numFetchThreads = 8 ;
opts.lite = false ;
opts.imdbPath = fullfile(opts.dataDir, 'imdb-rgb.mat');
opts.ARPoolLayer = 'conv0'; % before conv1
opts.DropOutRate = 0.5 ;
opts.epochFactor = 5 ;
opts.split = 1; % data split
opts.reverseDyn = 0; % reverse video frames e.g.[N:-1:1]
opts.train = struct() ;
opts.train.gpus = [];
opts.train.batchSize = 128 ;
opts.train.numSubBatches = 16 ;
opts.train.solver = [] ;
opts.train.prefetch = true ;
opts.train.numEpochs = 30 ;
% resnet50 rate (kept for reference; the caffe-ref rate below overrides it)
% opts.train.learningRate = 1e-2 * [ones(1,2), 0.1*ones(1,1), 0.01*ones(1,1)];
% caffe-ref
opts.train.learningRate = 1e-4 * [ones(1,2), 0.1*ones(1,1), 0.01*ones(1,1)];
opts = vl_argparse(opts, varargin) ;
if ~isfield(opts.train, 'gpus'), opts.train.gpus = []; end;
% opts.train.numEpochs = numel(opts.train.learningRate);
% -------------------------------------------------------------------------
% Prepare data
% -------------------------------------------------------------------------
if exist(opts.imdbPath,'file')
imdb = load(opts.imdbPath) ;
else
imdb = opts.datasetFn('dataDir', opts.dataDir, 'lite', opts.lite) ;
mkdir(opts.expDir) ;
save(opts.imdbPath, '-struct', 'imdb') ;
end
% UCF101 has 3 data splits
if opts.split>3
error('split should be <=3');
end
imdb.images.set = imdb.images.sets(opts.split,:);
% reverse frame order
if opts.reverseDyn
for i=1:numel(imdb.images.names)
imdb.images.names{i} = imdb.images.names{i}(end:-1:1);
end
end
% -------------------------------------------------------------------------
% Prepare model
% -------------------------------------------------------------------------
net = load(opts.modelPath);
if isfield(net,'net')
net = net.net;
end
opts.nCls = max(imdb.images.label) ;
net = opts.networkFn(net,opts);
if numel(net.meta.normalization.averageImage)>3
sz = size(net.meta.normalization.averageImage) ;
net.meta.normalization.averageImage = ...
mean(reshape(net.meta.normalization.averageImage,[sz(1)*sz(2) sz(3)]),1) ;
end
% Set the class names in the network
net.meta.classes.name = imdb.classes.name ;
net.meta.classes.description = imdb.classes.name ;
% -------------------------------------------------------------------------
% Learn
% -------------------------------------------------------------------------
if opts.epochFactor>0
opts.train.train = repmat(find(imdb.images.set==1),[1 opts.epochFactor]) ;
else
opts.train.train = NaN ;
end
opts.train.val = find(imdb.images.set==3) ;
[net, info] = cnn_train_dicnn_dag(net, imdb, getBatchFn(opts, net.meta), ...
'expDir', opts.expDir, ...
opts.train) ;
% -------------------------------------------------------------------------
% Report accuracy
% -------------------------------------------------------------------------
errlayer = net.getLayerIndex('errMC') ;
if ~isnan(errlayer)
cats = imdb.classes.name ;
accs = net.layers(errlayer).block.accuracy ;
if numel(cats)~=numel(accs)
error('Wrong number of classes.') ;
end
for i=1:numel(cats)
fprintf('%s acc %.1f\n',cats{i},100*accs(i)) ;
end
fprintf('Mean accuracy %.1f\n',100*mean(accs)) ;
end
% -------------------------------------------------------------------------
function fn = getBatchFn(opts, meta)
% -------------------------------------------------------------------------
useGpu = numel(opts.train.gpus) > 0 ;
bopts.numThreads = opts.numFetchThreads ;
bopts.imageSize = meta.normalization.imageSize ;
if isfield(meta.normalization,'border')
bopts.border = meta.normalization.border ;
else
bopts.border = meta.normalization.imageSize(1:2) ./ ...
meta.normalization.cropSize - meta.normalization.imageSize(1:2);
end
% bopts.averageImage = [];
bopts.averageImage = meta.normalization.averageImage ;
bopts.interpolation = meta.normalization.interpolation ;
bopts.keepAspect = meta.normalization.keepAspect ;
% bopts.rgbVariance = meta.augmentation.rgbVariance ;
% bopts.transformation = meta.augmentation.transformation ;
fn = @(x,y) getDagNNBatch(bopts,useGpu,x,y) ;
% -------------------------------------------------------------------------
function inputs = getDagNNBatch(opts, useGpu, imdb, batch)
% -------------------------------------------------------------------------
% batch refers to videos (not for frames)
if isempty(batch)
inputs = {'input', [], 'label', [], 'VideoId1', [], 'VideoId2', []};
return;
end
isVal = ~isempty(batch) && imdb.images.set(batch(1)) ~= 1 ;
% if ~isVal, transformation='stretch'; else transformation='none';end
if ~isVal, transformation='multiScaleRegular'; else transformation='none';end
names = imdb.images.names(batch);
% images = strcat([imdb.imageDir filesep], imdb.images.name(batch)) ;
namesM = {};
nVids = numel(batch);
VideoId1 = [];
VideoId2 = [];
% step-size
stepSize = 6;
% pool nFrames into a dynamic image
nFrames = 1;
% number of dynamic images to be max pooled later
nDynImgs = 10;
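% e.g. a 100-frame video gives numel(1:6:100) = 17 candidate frames; during
% training min(10, ceil(0.5*17)) = 9 are sampled at random, while validation
% keeps all 17.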
c1 = 1;
for v=1:nVids
name = names{v};
nFrms = numel(name);
nSample = nFrames;
nr = numel(1:stepSize:nFrms);
% jitter by randomly dropping about 50% of the candidate frames and cap the video at nDynImgs samples
if nr > 1 && (~isVal && nr>nDynImgs)
rat = min(nDynImgs,ceil(0.50*nr));
ri = randperm(nr);
ri = ri(1:rat);
r = zeros(1,nr);
r(ri) = 1;
else
r = ones(1,nr);
end
c3 = 1;
c2 = 0;
for f=1:stepSize:nFrms
if r(c3)
idx = f:min(f+nSample-1,nFrms) ;
if numel(idx)<nFrames
idx = [idx idx(end) * ones(1,nFrames-numel(idx))];
end
namesM{end+1} = name(idx);
VideoId1 = [VideoId1 c1 * ones(1,numel(idx))];
c1 = c1 + 1;
c2 = c2 + 1;
end
c3 = c3 + 1;
end
VideoId2 = [VideoId2 v * ones(1,c2) ] ;
end
images = strcat([imdb.imageDir filesep], horzcat(namesM{:}) ) ;
im = cnn_video_rgb_get_batch(images, VideoId1, opts, ...
'transformation', transformation, 'prefetch', nargout == 0, ...
'subMean', false) ;
if nargout > 0
if useGpu
im = gpuArray(im) ;
end
inputs = {'input', im, 'label', imdb.images.label(batch), ...
'VideoId2', VideoId2};
end
================================================
FILE: dicnn/cnn_train_dicnn_dag.m
================================================
function [net,stats] = cnn_train_dicnn_dag(net, imdb, getBatch, varargin)
%CNN_TRAIN_DICNN_DAG Demonstrates training a CNN using the DagNN wrapper
% CNN_TRAIN_DAG() is similar to CNN_TRAIN(), but works with
% the DagNN wrapper instead of the SimpleNN wrapper.
% Copyright (C) 2014-16 Andrea Vedaldi.
% All rights reserved.
%
% This file is part of the VLFeat library and is made available under
% the terms of the BSD license (see the COPYING file).
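%
% Minimal usage sketch (illustrative names, not part of this repository):
%   [net, stats] = cnn_train_dicnn_dag(net, imdb, getBatchFn, ...
%       'expDir', 'exp/myrun', 'gpus', 1, 'batchSize', 128) ;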
addpath(fullfile(vl_rootnn, 'examples'));
opts.expDir = fullfile('data','exp') ;
opts.continue = true ;
opts.batchSize = 256 ;
opts.numSubBatches = 1 ;
opts.train = [] ;
opts.val = [] ;
opts.gpus = [] ;
opts.prefetch = false ;
opts.epochSize = inf;
opts.numEpochs = 300 ;
opts.learningRate = 0.001 ;
opts.weightDecay = 0.0005 ;
opts.solver = [] ; % Empty array means use the default SGD solver
[opts, varargin] = vl_argparse(opts, varargin) ;
if ~isempty(opts.solver)
assert(isa(opts.solver, 'function_handle') && nargout(opts.solver) == 2,...
'Invalid solver; expected a function handle with two outputs.') ;
% Call without input arguments, to get default options
opts.solverOpts = opts.solver() ;
end
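% Any function with this two-output interface works, e.g. the solvers
% shipped with matconvnet in examples/+solver (on the path via the
% addpath above): opts.solver = @solver.adam ;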
opts.momentum = 0.9 ;
opts.saveSolverState = true ;
opts.nesterovUpdate = false ;
opts.randomSeed = 0 ;
opts.profile = false ;
opts.parameterServer.method = 'mmap' ;
opts.parameterServer.prefix = 'mcn' ;
opts.derOutputs = {'objective', 1} ;
opts.extractStatsFn = @extractStats ;
opts.plotStatistics = true;
opts.postEpochFn = [] ; % postEpochFn(net,params,state) called after each epoch; can return a new learning rate, 0 to stop, [] for no change
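% e.g. a minimal hook that decays the learning rate by 5% per epoch:
%   opts.postEpochFn = @(net,params,state) params.learningRate * 0.95 ;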
opts = vl_argparse(opts, varargin) ;
if ~exist(opts.expDir, 'dir'), mkdir(opts.expDir) ; end
if isempty(opts.train), opts.train = find(imdb.images.set==1) ; end
if isempty(opts.val), opts.val = find(imdb.images.set==2) ; end
if isscalar(opts.train) && isnumeric(opts.train) && isnan(opts.train)
opts.train = [] ;
end
if isscalar(opts.val) && isnumeric(opts.val) && isnan(opts.val)
opts.val = [] ;
end
% -------------------------------------------------------------------------
% Initialization
% -------------------------------------------------------------------------
evaluateMode = isempty(opts.train) ;
if ~evaluateMode
if isempty(opts.derOutputs)
error('DEROUTPUTS must be specified when training.\n') ;
end
end
% -------------------------------------------------------------------------
% Train and validate
% -------------------------------------------------------------------------
modelPath = @(ep) fullfile(opts.expDir, sprintf('net-epoch-%d.mat', ep));
modelFigPath = fullfile(opts.expDir, 'net-train.pdf') ;
start = opts.continue * findLastCheckpoint(opts.expDir) ;
if start >= 1
fprintf('%s: resuming by loading epoch %d\n', mfilename, start) ;
[net, state, stats] = loadState(modelPath(start)) ;
else
state = [] ;
end
for epoch=start+1:opts.numEpochs
% Set the random seed based on the epoch and opts.randomSeed.
% This is important for reproducibility, including when training
% is restarted from a checkpoint.
rng(epoch + opts.randomSeed) ;
prepareGPUs(opts, epoch == start+1) ;
% Train for one epoch.
params = opts ;
params.epoch = epoch ;
params.learningRate = opts.learningRate(min(epoch, numel(opts.learningRate))) ;
params.train = opts.train(randperm(numel(opts.train))) ; % shuffle
params.train = params.train(1:min(opts.epochSize, numel(opts.train)));
params.val = opts.val(randperm(numel(opts.val))) ;
params.imdb = imdb ;
params.getBatch = getBatch ;
if numel(opts.gpus) <= 1
[net, state] = processEpoch(net, state, params, 'train') ;
[net, state] = processEpoch(net, state, params, 'val') ;
if ~evaluateMode
saveState(modelPath(epoch), net, state) ;
end
lastStats = state.stats ;
else
spmd
[net, state] = processEpoch(net, state, params, 'train') ;
[net, state] = processEpoch(net, state, params, 'val') ;
if labindex == 1 && ~evaluateMode
saveState(modelPath(epoch), net, state) ;
end
lastStats = state.stats ;
end
lastStats = accumulateStats(lastStats) ;
end
stats.train(epoch) = lastStats.train ;
stats.val(epoch) = lastStats.val ;
clear lastStats ;
saveStats(modelPath(epoch), stats) ;
if opts.plotStatistics
switchFigure(1) ; clf ;
plots = setdiff(...
cat(2,...
fieldnames(stats.train)', ...
fieldnames(stats.val)'), {'num', 'time'}) ;
for p = plots
p = char(p) ;
values = zeros(0, epoch) ;
leg = {} ;
for f = {'train', 'val'}
f = char(f) ;
if isfield(stats.(f), p)
tmp = [stats.(f).(p)] ;
values(end+1,:) = tmp(1,:)' ;
leg{end+1} = f ;
end
end
subplot(1,numel(plots),find(strcmp(p,plots))) ;
plot(1:epoch, values','o-') ;
xlabel('epoch') ;
title(p) ;
legend(leg{:}) ;
grid on ;
end
drawnow ;
print(1, modelFigPath, '-dpdf') ;
end
if ~isempty(opts.postEpochFn)
if nargout(opts.postEpochFn) == 0
opts.postEpochFn(net, params, state) ;
else
lr = opts.postEpochFn(net, params, state) ;
if ~isempty(lr), opts.learningRate = lr; end
if opts.learningRate == 0, break; end
end
end
end
% With multiple GPUs, return one copy
if isa(net, 'Composite'), net = net{1} ; end
% -------------------------------------------------------------------------
function [net, state] = processEpoch(net, state, params, mode)
% -------------------------------------------------------------------------
% Note that net is not strictly needed as an output argument as net
% is a handle class. However, this fixes some aliasing issue in the
% spmd caller.
% initialize with momentum 0
if isempty(state) || isempty(state.solverState)
state.solverState = cell(1, numel(net.params)) ;
state.solverState(:) = {0} ;
end
% move CNN to GPU as needed
numGpus = numel(params.gpus) ;
if numGpus >= 1
net.move('gpu') ;
for i = 1:numel(state.solverState)
s = state.solverState{i} ;
if isnumeric(s)
state.solverState{i} = gpuArray(s) ;
elseif isstruct(s)
state.solverState{i} = structfun(@gpuArray, s, 'UniformOutput', false) ;
end
end
end
if numGpus > 1
parserv = ParameterServer(params.parameterServer) ;
net.setParameterServer(parserv) ;
else
parserv = [] ;
end
% profile
if params.profile
if numGpus <= 1
profile clear ;
profile on ;
else
mpiprofile reset ;
mpiprofile on ;
end
end
num = 0 ;
epoch = params.epoch ;
subset = params.(mode) ;
adjustTime = 0 ;
stats.num = 0 ; % return something even if subset = []
stats.time = 0 ;
start = tic ;
for t=1:params.batchSize:numel(subset)
fprintf('%s: epoch %02d: %3d/%3d:', mode, epoch, ...
fix((t-1)/params.batchSize)+1, ceil(numel(subset)/params.batchSize)) ;
batchSize = min(params.batchSize, numel(subset) - t + 1) ;
for s=1:params.numSubBatches
% get this image batch and prefetch the next
batchStart = t + (labindex-1) + (s-1) * numlabs ;
batchEnd = min(t+params.batchSize-1, numel(subset)) ;
batch = subset(batchStart : params.numSubBatches * numlabs : batchEnd) ;
num = num + numel(batch) ;
if numel(batch) == 0, continue ; end
inputs = params.getBatch(params.imdb, batch) ;
if params.prefetch
if s == params.numSubBatches
batchStart = t + (labindex-1) + params.batchSize ;
batchEnd = min(t+2*params.batchSize-1, numel(subset)) ;
else
batchStart = batchStart + numlabs ;
end
nextBatch = subset(batchStart : params.numSubBatches * numlabs : batchEnd) ;
params.getBatch(params.imdb, nextBatch) ;
end
if strcmp(mode, 'train')
net.mode = 'normal' ;
net.accumulateParamDers = (s ~= 1) ;
net.eval(inputs, params.derOutputs, 'holdOn', s < params.numSubBatches) ;
else
net.mode = 'test' ;
net.eval(inputs) ;
end
end
% Accumulate gradient.
if strcmp(mode, 'train')
if ~isempty(parserv), parserv.sync() ; end
state = accumulateGradients(net, state, params, parserv) ;
end
% Get statistics.
time = toc(start) + adjustTime ;
batchTime = time - stats.time ;
stats.num = num ;
stats.time = time ;
stats = params.extractStatsFn(stats,net) ;
currentSpeed = batchSize / batchTime ;
averageSpeed = (t + batchSize - 1) / time ;
if t == 3*params.batchSize + 1
% compensate for the first three iterations, which are outliers
adjustTime = 4*batchTime - time ;
stats.time = time + adjustTime ;
end
fprintf(' %.1f (%.1f) Hz', averageSpeed, currentSpeed) ;
for f = setdiff(fieldnames(stats)', {'num', 'time'})
f = char(f) ;
fprintf(' %s: %.3f', f, stats.(f)) ;
end
fprintf('\n') ;
end
% Save back to state.
state.stats.(mode) = stats ;
if params.profile
if numGpus <= 1
state.prof.(mode) = profile('info') ;
profile off ;
else
state.prof.(mode) = mpiprofile('info');
mpiprofile off ;
end
end
if ~params.saveSolverState
state.solverState = [] ;
else
for i = 1:numel(state.solverState)
s = state.solverState{i} ;
if isnumeric(s)
state.solverState{i} = gather(s) ;
elseif isstruct(s)
state.solverState{i} = structfun(@gather, s, 'UniformOutput', false) ;
end
end
end
net.reset() ;
net.move('cpu') ;
% -------------------------------------------------------------------------
function state = accumulateGradients(net, state, params, parserv)
% -------------------------------------------------------------------------
numGpus = numel(params.gpus) ;
otherGpus = setdiff(1:numGpus, labindex) ;
den = params.numSubBatches * max(numGpus,1) ;
for p=1:numel(net.params)
if ~isempty(parserv)
parDer = parserv.pullWithIndex(p) ;
else
parDer = net.params(p).der ;
end
switch net.params(p).trainMethod
case 'average' % mainly for batch normalization
thisLR = net.params(p).learningRate ;
net.params(p).value = vl_taccum(...
1 - thisLR, net.params(p).value, ...
(thisLR/den/net.params(p).fanout), parDer) ;
case 'gradient'
thisDecay = params.weightDecay * net.params(p).weightDecay ;
thisLR = params.learningRate * net.params(p).learningRate ;
if thisLR>0 || thisDecay>0
% Normalize gradient and incorporate weight decay.
parDer = vl_taccum(1/den, parDer, ...
thisDecay, net.params(p).value) ;
if isempty(params.solver)
% Default solver is the optimised SGD.
% Update momentum.
state.solverState{p} = vl_taccum(...
params.momentum, state.solverState{p}, ...
-1, parDer) ;
% Nesterov update (aka one step ahead).
if params.nesterovUpdate
delta = params.momentum * state.solverState{p} - parDer ;
else
delta = state.solverState{p} ;
end
% Update parameters.
net.params(p).value = vl_taccum(...
1, net.params(p).value, thisLR, delta) ;
else
% call solver function to update weights
[net.params(p).value, state.solverState{p}] = ...
params.solver(net.params(p).value, state.solverState{p}, ...
parDer, params.solverOpts, thisLR) ;
end
end
otherwise
error('Unknown training method ''%s'' for parameter ''%s''.', ...
net.params(p).trainMethod, ...
net.params(p).name) ;
end
end
% -------------------------------------------------------------------------
function stats = accumulateStats(stats_)
% -------------------------------------------------------------------------
for s = {'train', 'val'}
s = char(s) ;
total = 0 ;
% initialize stats stucture with same fields and same order as
% stats_{1}
stats__ = stats_{1} ;
names = fieldnames(stats__.(s))' ;
values = zeros(1, numel(names)) ;
fields = cat(1, names, num2cell(values)) ;
stats.(s) = struct(fields{:}) ;
for g = 1:numel(stats_)
stats__ = stats_{g} ;
num__ = stats__.(s).num ;
total = total + num__ ;
for f = setdiff(fieldnames(stats__.(s))', 'num')
f = char(f) ;
stats.(s).(f) = stats.(s).(f) + stats__.(s).(f) * num__ ;
if g == numel(stats_)
stats.(s).(f) = stats.(s).(f) / total ;
end
end
end
stats.(s).num = total ;
end
% -------------------------------------------------------------------------
function stats = extractStats(stats, net)
% -------------------------------------------------------------------------
sel = find(cellfun(@(x) isa(x,'dagnn.Loss'), {net.layers.block})) ;
for i = 1:numel(sel)
if net.layers(sel(i)).block.ignoreAverage, continue; end;
stats.(net.layers(sel(i)).outputs{1}) = net.layers(sel(i)).block.average ;
end
% -------------------------------------------------------------------------
function saveState(fileName, net_, state)
% -------------------------------------------------------------------------
net = net_.saveobj() ;
save(fileName, 'net', 'state') ;
% -------------------------------------------------------------------------
function saveStats(fileName, stats)
% -------------------------------------------------------------------------
if exist(fileName, 'file')
save(fileName, 'stats', '-append') ;
else
save(fileName, 'stats') ;
end
% -------------------------------------------------------------------------
function [net, state, stats] = loadState(fileName)
% -------------------------------------------------------------------------
load(fileName, 'net', 'state', 'stats') ;
net = dagnn.DagNN.loadobj(net) ;
if isempty(whos('stats'))
error('Epoch ''%s'' was only partially saved. Delete this file and try again.', ...
fileName) ;
end
% -------------------------------------------------------------------------
function epoch = findLastCheckpoint(modelDir)
% -------------------------------------------------------------------------
list = dir(fullfile(modelDir, 'net-epoch-*.mat')) ;
tokens = regexp({list.name}, 'net-epoch-([\d]+).mat', 'tokens') ;
epoch = cellfun(@(x) sscanf(x{1}{1}, '%d'), tokens) ;
epoch = max([epoch 0]) ;
% -------------------------------------------------------------------------
function switchFigure(n)
% -------------------------------------------------------------------------
if get(0,'CurrentFigure') ~= n
try
set(0,'CurrentFigure',n) ;
catch
figure(n) ;
end
end
% -------------------------------------------------------------------------
function clearMex()
% -------------------------------------------------------------------------
clear vl_tmove vl_imreadjpeg ;
% -------------------------------------------------------------------------
function prepareGPUs(opts, cold)
% -------------------------------------------------------------------------
numGpus = numel(opts.gpus) ;
if numGpus > 1
% check parallel pool integrity as it could have timed out
pool = gcp('nocreate') ;
if ~isempty(pool) && pool.NumWorkers ~= numGpus
delete(pool) ;
end
pool = gcp('nocreate') ;
if isempty(pool)
parpool('local', numGpus) ;
cold = true ;
end
end
if numGpus >= 1 && cold
fprintf('%s: resetting GPU\n', mfilename)
clearMex() ;
if numGpus == 1
gpuDevice(opts.gpus)
else
spmd
clearMex() ;
gpuDevice(opts.gpus(labindex))
end
end
end
================================================
FILE: dicnn/cnn_video_of_get_batch.m
================================================
function imo = cnn_video_of_get_batch(images, vids, varargin)
% CNN_VIDEO_OF_GET_BATCH Load, preprocess, and pack images for CNN evaluation
% VIDS gives the video id of each u/v frame pair, so that all frames from
% the same video share the same spatial jittering.
% NOTE: all the frames from a video should have the same size (w x h)
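%
% Example (hypothetical paths): two flow-frame pairs from a single video,
% listed as u,v,u,v with one video id per u/v pair:
%   images = {'u/vid1/frame1.jpg','v/vid1/frame1.jpg', ...
%             'u/vid1/frame7.jpg','v/vid1/frame7.jpg'} ;
%   imo = cnn_video_of_get_batch(images, [1 1]) ; % H x W x 2 x 2 single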
opts.imageSize = [227, 227] ;
opts.border = [29, 29] ;
opts.keepAspect = true ;
opts.numAugments = 1 ;
opts.transformation = 'multiScaleRegular' ;
opts.averageImage = [] ;
opts.rgbVariance = zeros(0,2,'single') ;
opts.interpolation = 'bilinear' ;
opts.numThreads = 1 ;
opts.prefetch = false ;
opts.lazyResize = true ;
opts.subMean = false; % subtract the mean from each video
opts = vl_argparse(opts, varargin);
% fetch is true if images is a list of filenames (instead of
% a cell array of images)
fetch = numel(images) >= 1 && ischar(images{1}) ;
% prefetch is used to load images in a separate thread
prefetch = fetch & opts.prefetch ;
if prefetch
vl_imreadjpeg(images, 'numThreads', opts.numThreads, 'prefetch') ;
imo = [] ;
return ;
end
if fetch
im = vl_imreadjpeg(images,'numThreads', opts.numThreads) ;
else
im = images ;
end
tfs = [] ;
switch opts.transformation
case 'none'
tfs = [
.5 ;
.5 ;
0 ] ;
case 'f5'
tfs = [...
.5 0 0 1 1 .5 0 0 1 1 ;
.5 0 1 0 1 .5 0 1 0 1 ;
0 0 0 0 0 1 1 1 1 1] ;
case 'f25'
[tx,ty] = meshgrid(linspace(0,1,5)) ;
tfs = [tx(:)' ; ty(:)' ; zeros(1,numel(tx))] ;
tfs_ = tfs ;
tfs_(3,:) = 1 ;
tfs = [tfs,tfs_] ;
case 'stretch'
case 'multiScaleRegular'
otherwise
error('Unknown transformation %s', opts.transformation) ;
end
[~,transformations] = sort(rand(size(tfs,2), numel(images)), 1) ;
if ~isempty(opts.rgbVariance) && isempty(opts.averageImage)
opts.averageImage = zeros(1,1,2) ;
end
if numel(opts.averageImage) == 2
opts.averageImage = reshape(opts.averageImage, 1,1,2) ;
end
imo = zeros(opts.imageSize(1), opts.imageSize(2), 2, ...
numel(images)/2*opts.numAugments, 'single') ;
nVid = max(vids);
si = 1 ;
countv = 1;
for v=1:nVid
vid = find(vids==v);
for i=1:numel(images(vid))
% acquire image
if isempty(im{2*vid(i)-1})
imt1 = imread(images{2*vid(i)-1}) ;
imt2 = imread(images{2*vid(i)}) ;
else
imt1 = im{2*vid(i)-1} ;
imt2 = im{2*vid(i)} ;
end
imt = single(cat(3,imt1,imt2)) ; % single() is faster than im2single and keeps the 0-255 range
% resize
w = size(imt,2) ;
h = size(imt,1) ;
factor = [(opts.imageSize(1)+opts.border(1))/h ...
(opts.imageSize(2)+opts.border(2))/w];
if opts.keepAspect
factor = max(factor) ;
end
if any(abs(factor - 1) > 0.0001)
imt = imresize(imt, ...
'scale', factor, ...
'method', opts.interpolation) ;
end
% crop & flip
if i==1
flip = rand > 0.5 ;
w = size(imt,2) ;
h = size(imt,1) ;
switch opts.transformation
case 'stretch'
sz = round(min(opts.imageSize(1:2)' .* (1-0.1+0.2*rand(2,1)), [w;h])) ;
dx = randi(w - sz(2) + 1, 1) ;
dy = randi(h - sz(1) + 1, 1) ;
% flip = rand > 0.5 ;
case 'multiScaleRegular'
reg_szs = [256, 224, 192, 168] ;
sz(1) = reg_szs(randi(4)); sz(2) = reg_szs(randi(4));
dy = [0 h-sz(1) 0 h-sz(1) floor((h-sz(1)+1)/2)] + 1;
dx = [0 w-sz(2) w-sz(2) 0 floor((w-sz(2)+1)/2)] + 1;
corner = randi(5);
dx = dx(corner); dy = dy(corner);
otherwise
tf = tfs(:, transformations(mod(0, numel(transformations)) + 1)) ;
sz = opts.imageSize(1:2) ;
dx = floor((w - sz(2)) * tf(2)) + 1 ;
dy = floor((h - sz(1)) * tf(1)) + 1 ;
% flip = tf(3) ;
end
end
if opts.lazyResize
sx = round(linspace(dx, sz(2)+dx-1, opts.imageSize(2))) ;
sy = round(linspace(dy, sz(1)+dy-1, opts.imageSize(1))) ;
else
factor = [opts.imageSize(1)/sz(1) ...
opts.imageSize(2)/sz(2)];
if any(abs(factor - 1) > 0.0001)
imt = imresize(gather(imt(dy:sz(1)+dy-1,dx:sz(2)+dx-1,:)), [opts.imageSize(1:2)],...
'Antialiasing', false, 'Method', opts.interpolation);
end
sx = 1:opts.imageSize(2); sy = 1:opts.imageSize(1);
end
if flip
sx = fliplr(sx) ;
imo(:,:,1,si) = 255 - imt(sy,sx,1) ;
imo(:,:,2,si) = imt(sy,sx,2) ;
else
imo(:,:,:,si) = imt(sy,sx,:) ;
end
si = si + 1 ;
end
countv = countv + numel(images(vid));
end
if ~isempty(opts.averageImage) && numel(opts.averageImage)==2
if ~isempty(opts.rgbVariance)
imo = bsxfun(@minus, imo, opts.averageImage+reshape(opts.rgbVariance * randn(2,1), 1,1,2)) ;
else
imo = bsxfun(@minus, imo, opts.averageImage) ;
end
end
================================================
FILE: dicnn/cnn_video_rgb_get_batch.m
================================================
function imo = cnn_video_rgb_get_batch(images, vids, varargin)
% CNN_VIDEO_RGB_GET_BATCH Load, preprocess, and pack images for CNN evaluation
% VIDS gives the video id of each frame, so that all frames from the same
% video share the same spatial jittering.
% NOTE: all the frames from a video should have the same size (w x h)
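%
% Example (hypothetical paths): three frames from two videos; the first two
% share video 1's crop and flip:
%   imo = cnn_video_rgb_get_batch({'vid1/f1.jpg','vid1/f7.jpg','vid2/f1.jpg'}, ...
%       [1 1 2]) ; % H x W x 3 x 3 single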
opts.imageSize = [227, 227] ;
opts.border = [29, 29] ;
opts.keepAspect = true ;
opts.numAugments = 1 ;
opts.transformation = 'none' ;
opts.averageImage = [] ;
opts.rgbVariance = zeros(0,3,'single') ;
opts.interpolation = 'bilinear' ;
opts.numThreads = 1 ;
opts.prefetch = false ;
opts.subMean = false ; % subtract the mean from each video
opts.lazyResize = true ;
opts = vl_argparse(opts, varargin);
% fetch is true if images is a list of filenames (instead of
% a cell array of images)
fetch = numel(images) >= 1 && ischar(images{1}) ;
% prefetch is used to load images in a separate thread
prefetch = fetch & opts.prefetch ;
if prefetch
vl_imreadjpeg(images, 'numThreads', opts.numThreads, 'prefetch') ;
imo = [] ;
return ;
end
if fetch
im = vl_imreadjpeg(images,'numThreads', opts.numThreads) ;
else
im = images ;
end
tfs = [] ;
switch opts.transformation
case 'none'
tfs = [
.5 ;
.5 ;
0 ] ;
case 'f5'
tfs = [...
.5 0 0 1 1 .5 0 0 1 1 ;
.5 0 1 0 1 .5 0 1 0 1 ;
0 0 0 0 0 1 1 1 1 1] ;
case 'f25'
[tx,ty] = meshgrid(linspace(0,1,5)) ;
tfs = [tx(:)' ; ty(:)' ; zeros(1,numel(tx))] ;
tfs_ = tfs ;
tfs_(3,:) = 1 ;
tfs = [tfs,tfs_] ;
case 'stretch'
case 'multiScaleRegular'
otherwise
error('Unknown transformation %s', opts.transformation) ;
end
[~,transformations] = sort(rand(size(tfs,2), numel(images)), 1) ;
if ~isempty(opts.rgbVariance) && isempty(opts.averageImage)
opts.averageImage = zeros(1,1,3) ;
end
if numel(opts.averageImage) == 3
opts.averageImage = reshape(opts.averageImage, 1,1,3) ;
end
imo = zeros(opts.imageSize(1), opts.imageSize(2), 3, ...
numel(images)*opts.numAugments, 'single') ;
nVid = max(vids);
si = 1 ;
countv = 1;
for v=1:nVid
vid = find(vids==v);
for i=1:numel(images(vid))
% acquire image
if isempty(im{vid(i)})
imt = imread(images{vid(i)}) ;
imt = single(imt) ; % single() is faster than im2single and keeps the 0-255 range
else
imt = im{vid(i)} ;
end
if size(imt,3) == 1
imt = cat(3, imt, imt, imt) ;
end
% resize
w = size(imt,2) ;
h = size(imt,1) ;
factor = [(opts.imageSize(1)+opts.border(1))/h ...
(opts.imageSize(2)+opts.border(2))/w];
if opts.keepAspect
factor = max(factor) ;
end
if any(abs(factor - 1) > 0.0001)
imt = imresize(imt, ...
'scale', factor, ...
'method', opts.interpolation) ;
end
% crop & flip
if i==1
w = size(imt,2) ;
h = size(imt,1) ;
switch opts.transformation
case 'stretch'
sz = round(min(opts.imageSize(1:2)' .* (1-0.1+0.2*rand(2,1)), [w;h])) ;
dx = randi(w - sz(2) + 1, 1) ;
dy = randi(h - sz(1) + 1, 1) ;
flip = rand > 0.5 ;
case 'multiScaleRegular'
reg_szs = [256, 224, 192, 168] ;
sz(1) = reg_szs(randi(4)); sz(2) = reg_szs(randi(4));
dy = [0 h-sz(1) 0 h-sz(1) floor((h-sz(1)+1)/2)] + 1;
dx = [0 w-sz(2) w-sz(2) 0 floor((w-sz(2)+1)/2)] + 1;
corner = randi(5);
dx = dx(corner); dy = dy(corner);
flip = rand > 0.5 ;
otherwise
tf = tfs(:, transformations(mod(0, numel(transformations)) + 1)) ;
sz = opts.imageSize(1:2) ;
dx = floor((w - sz(2)) * tf(2)) + 1 ;
dy = floor((h - sz(1)) * tf(1)) + 1 ;
flip = tf(3) ;
end
end
if opts.lazyResize
sx = round(linspace(dx, sz(2)+dx-1, opts.imageSize(2))) ;
sy = round(linspace(dy, sz(1)+dy-1, opts.imageSize(1))) ;
else
factor = [opts.imageSize(1)/sz(1) ...
opts.imageSize(2)/sz(2)];
if any(abs(factor - 1) > 0.0001)
imt = imresize(gather(imt(dy:sz(1)+dy-1,dx:sz(2)+dx-1,:)), ...
opts.imageSize(1:2), 'Antialiasing', false, ...
'Method', opts.interpolation);
end
sx = 1:opts.imageSize(2); sy = 1:opts.imageSize(1);
end
if flip
sx = fliplr(sx) ;
end
imo(:,:,:,si) = imt(sy,sx,:) ;
si = si + 1 ;
end
countv = countv + numel(images(vid));
end
if ~isempty(opts.averageImage) && numel(opts.averageImage)==3
if ~isempty(opts.rgbVariance)
imo = bsxfun(@minus, imo, opts.averageImage+reshape(opts.rgbVariance * randn(3,1), 1,1,3)) ;
else
imo = bsxfun(@minus, imo, opts.averageImage) ;
end
end
================================================
FILE: dicnn/compute_approximate_dynamic_images.m
================================================
function di = compute_approximate_dynamic_images(images)
% Computes approximate dynamic images for a given array of images
% IMAGES must be a tensor of H x W x D x N dimensionality or
% cell of image names
% For the exact dynamic images, use the code
% http://users.cecs.anu.edu.au/~basura/dynamic_images/code.zip
% Explained here http://arxiv.org/abs/1512.01848
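%
% Usage (hypothetical frame files):
%   di = compute_approximate_dynamic_images({'f1.jpg','f2.jpg','f3.jpg'}) ;
% For N frames the result is the fixed weighted sum sum_t alpha_t * I_t with
% rank-pooling coefficients alpha_t = 2*(N-t+1) - (N+1)*(H_N - H_{t-1}),
% where H_n = 1 + 1/2 + ... + 1/n (see the paper linked above).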
if isempty(images)
di = [] ;
return ;
end
if iscell(images)
imagesA = cell(1,numel(images)) ;
for i=1:numel(images)
if ~ischar(images{i})
error('images must be an array of images or cell of image names') ;
end
imagesA{i} = imread(images{i}) ;
end
images = cat(4,imagesA{:}) ;
end
N = size(images,4) ;
di = vl_nnarpooltemporal(single(images),ones(1,N)) ;
================================================
FILE: dicnn/visualize_approximate_dynamic_images.m
================================================
function visualize_approximate_dynamic_images(images)
% VISUALIZE_APPROXIMATE_DYNAMIC_IMAGES Renders the approximate dynamic image of the given frames
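% e.g. visualize_approximate_dynamic_images({'f1.jpg','f2.jpg','f3.jpg'})
% (hypothetical frame files) displays the min-max normalized dynamic image.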
di = compute_approximate_dynamic_images(images) ;
di = di - min(di(:)) ;
di = 255 * di ./ max(di(:)) ;
image(uint8(di)) ;
================================================
FILE: main_train.m
================================================
model = 'resnext50' ; % {'cafferef','resnext50','resnext101'}
input = 'rgb' ; % {'rgb','of'}
dataset = 'ucf101' ; % {'ucf101','hmdb51'} hmdb51 requires more iterations to train (add more epochs to learning rate)
opts.train.batchSize = 128 ;
opts.train.numSubBatches = 32 ; % increase the number (16,32) if it does not fit into gpu mem
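% e.g. batchSize 128 with numSubBatches 32 runs 4 videos per forward/backward
% pass and accumulates gradients over the 32 sub-batches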
opts.epochFactor = 5 ;
opts.split = 1 ;
opts.train.gpus = 1 ;
run matconvnet/matlab/vl_setupnn.m ;
vl_contrib install mcnExtraLayers ; vl_contrib setup mcnExtraLayers ;
vl_contrib install autonn ; vl_contrib setup autonn ;
% addpath(fullfile('matconvnet','contrib','mcnExtraLayers','matlab')) ;
opts.expDir = ['exp/' model input '-arpool-split' num2str(opts.split)] ;
if strcmp(input,'rgb')
opts.DropOutRate = 0.5 ;
trainfn = @cnn_dicnn_rgb ;
elseif strcmp(input,'of')
opts.DropOutRate = 0.8 ;
trainfn = @cnn_dicnn_of ;
end
if strcmp(model,'cafferef')
opts.pool1Layer = 'conv1' ;
% download from http://www.vlfeat.org/matconvnet/models/imagenet-caffe-ref.mat
opts.modelPath = fullfile('models','imagenet-caffe-ref.mat') ;
opts.networkFn = @cnn_init_cafferef ;
if strcmp(input,'rgb')
opts.train.learningRate = 1e-3 * [ones(1,2) 0.1*ones(1,2)] ;
else
opts.train.learningRate = 3e-3 * [ones(1,10) 0.1*ones(1,2)] ;
end
opts.train.numEpochs = numel(opts.train.learningRate) ;
elseif strcmp(model,'resnext50') || strcmp(model,'resnext101')
% download from http://www.robots.ox.ac.uk/~albanie/models/pytorch-imports/resnext_50_32x4d-pt-mcn.mat
% download from http://www.robots.ox.ac.uk/~albanie/models/pytorch-imports/resnext_101_32x4d-pt-mcn.mat
if strcmp(model,'resnext50')
opts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat') ;
else
opts.modelPath = fullfile('models','resnext_101_32x4d-pt-mcn.mat') ;
end
opts.networkFn = @cnn_init_resnext ;
if strcmp(input,'rgb')
opts.train.learningRate = 1e-2 * [ones(1,2) 0.1*ones(1,8) ] ;
else
opts.train.learningRate = 1e-2 * [ones(1,2) 0.1*ones(1,2) ] ;
end
end
addpath dicnn ;
[net, info] = trainfn(opts)
================================================
FILE: utils/extract_frames.sh
================================================
#!/bin/bash
# This script converts videos into frames; run it from the directory that
# contains the .avi files. For a different frame rate, add e.g. `-r 1`
# before the output pattern.
for f in *.avi
do g=$(echo "$f" | sed 's/\.avi//');
echo "Processing $f";
mkdir -p "frames/$g/" ;
ffmpeg -i "$f" "frames/$g/image-%04d.jpeg" ;
done