Repository: hbilen/dynamic-image-nets Branch: master Commit: 96b91afab109 Files: 28 Total size: 93.3 KB Directory structure: gitextract_6iyvfhqz/ ├── .gitmodules ├── Datasets/ │ ├── cnn_hmdb51_of_setup_data.m │ ├── cnn_hmdb51_setup_data.m │ ├── cnn_ucf101_of_setup_data.m │ └── cnn_ucf101_setup_data.m ├── Layers/ │ ├── AppRankPooling.m │ ├── BatchNormN.m │ ├── ErrorMultiClass.m │ ├── L2Normalize.m │ ├── LossNormalized.m │ ├── TemporalPooling.m │ ├── vl_nnarpooltemporal.m │ ├── vl_nnl2norm.m │ └── vl_nnpooltemporal.m ├── README.md ├── dicnn/ │ ├── cnn_dicnn_of.m │ ├── cnn_dicnn_rgb.m │ ├── cnn_init_cafferef.m │ ├── cnn_init_resnext.m │ ├── cnn_single_of.m │ ├── cnn_single_rgb.m │ ├── cnn_train_dicnn_dag.m │ ├── cnn_video_of_get_batch.m │ ├── cnn_video_rgb_get_batch.m │ ├── compute_approximate_dynamic_images.m │ └── visualize_approximate_dynamic_images.m ├── main_train.m └── utils/ └── extract_frames.sh ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitmodules ================================================ [submodule "matconvnet"] path = matconvnet url = https://github.com/vlfeat/matconvnet branch = master ================================================ FILE: Datasets/cnn_hmdb51_of_setup_data.m ================================================ function imdb = cnn_hmdb51_of_setup_data(varargin) % CNN_UCF101_SETUP_DATA Initialize UCF101 - Action Recognition Data Set % http://crcv.ucf.edu/data/UCF101.php % this script requires UCF101 downloaded and frames extracted in frames % folder opts.dataDir = fullfile('data','HMDB51') ; opts.lite = false ; % opts = vl_argparse(opts, varargin) ; %% ------------------------------------------------------------------------ % Load categories metadata % ------------------------------------------------------------------------- % find images imagePath = fullfile(opts.dataDir, 'tvl1_flow', 'u', '*') ; images = 
dir(imagePath) ; videoNames = cell(1,numel(images)) ; frameNames = cell(1,numel(images)) ; nrFrames = zeros(1,numel(images)) ; for i=1:numel(images) frames = dir(fullfile(opts.dataDir,'tvl1_flow','u',images(i).name,'frame*.jpg')) ; framesc = cell(1,numel(frames)) ; if ~isempty(numel(frames)) for j=1:numel(frames) framesc{j} = frames(j).name ; end frameNames{i} = framesc ; frameNames{i} = strcat(images(i).name,'/',framesc) ; nrFrames(i) = numel(framesc) ; videoNames{i} = images(i).name ; end end videoNames(nrFrames==0) = [] ; frameNames(nrFrames==0) = [] ; % nrFrames(nrFrames==0) = [] ; frameNamesuv = cell(1,numel(frameNames)) ; for i=1:numel(frameNames) nn = frameNames{i} ; nn1 = strcat('u/',nn) ; nn2 = strcat('v/',nn) ; frameNamesuv{i} = cell(1,2*numel(nn1)) ; frameNamesuv{i}(1:2:end) = nn1 ; frameNamesuv{i}(2:2:end) = nn2 ; end % find metadata % ncls = 51 ; metaPath = fullfile(opts.dataDir, 'hmdb51_splits', '*.txt') ; splits = dir(metaPath) ; cats = cell(1,numel(videoNames)) ; sets = zeros(3,numel(videoNames)) ; catNames = cell(1,numel(splits)) ; for i=1:numel(splits) j = strfind(splits(i).name,'_test_') ; splitno = str2double(splits(i).name(j+11)) ; catNames{i} = splits(i).name(1:j-1) ; t = importdata(fullfile(opts.dataDir, 'hmdb51_splits', splits(i).name)) ; vids = cell(1,numel(t.textdata)) ; for k=1:numel(t.textdata) vids{k} = t.textdata{k}(1:end-4) ; end [ia,ib] = ismember(vids,videoNames) ; assert(all(ia)) ; sets(splitno,ib) = t.data' ; cats(ib) = repmat(catNames(i),numel(ia),1) ; end [cu,~,labels] = unique(cats) ; sets(sets(:)==2) = 3 ; imdb.classes.name = cu ; imdb.images.name = videoNames ; imdb.images.names = frameNamesuv ; imdb.images.label = labels' ; imdb.images.sets = sets ; imdb.imageDir = fullfile(opts.dataDir, 'tvl1_flow') ; ================================================ FILE: Datasets/cnn_hmdb51_setup_data.m ================================================ function imdb = cnn_hmdb51_setup_data(varargin) % CNN_UCF101_SETUP_DATA Initialize UCF101 
- Action Recognition Data Set % http://crcv.ucf.edu/data/UCF101.php % this script requires UCF101 downloaded and frames extracted in frames % folder opts.dataDir = fullfile('data','HMDB51') ; opts.lite = false ; % opts = vl_argparse(opts, varargin) ; %% ------------------------------------------------------------------------ % Load categories metadata % ------------------------------------------------------------------------- % find images imagePath = fullfile(opts.dataDir, 'frames', '*') ; images = dir(imagePath) ; videoNames = cell(1,numel(images)) ; frameNames = cell(1,numel(images)) ; nrFrames = zeros(1,numel(images)) ; for i=1:numel(images) frames = dir(fullfile(opts.dataDir,'frames',images(i).name,'frame*.jpg')) ; framesc = cell(1,numel(frames)) ; if ~isempty(numel(frames)) for j=1:numel(frames) framesc{j} = frames(j).name ; end frameNames{i} = strcat(images(i).name,'/',framesc) ; nrFrames(i) = numel(framesc) ; videoNames{i} = images(i).name ; end end videoNames(nrFrames==0) = [] ; frameNames(nrFrames==0) = [] ; % nrFrames(nrFrames==0) = [] ; % find metadata % ncls = 51 ; metaPath = fullfile(opts.dataDir, 'hmdb51_splits', '*.txt') ; splits = dir(metaPath) ; % splitFiles = cell(1,3*ncls) ; cats = cell(1,numel(videoNames)) ; sets = zeros(3,numel(videoNames)) ; catNames = cell(1,numel(splits)) ; for i=1:numel(splits) j = strfind(splits(i).name,'_test_') ; splitno = str2double(splits(i).name(j+11)) ; catNames{i} = splits(i).name(1:j-1) ; t = importdata(fullfile(opts.dataDir, 'hmdb51_splits', splits(i).name)) ; vids = cell(1,numel(t.textdata)) ; for k=1:numel(t.textdata) vids{k} = t.textdata{k}(1:end-4) ; end [ia,ib] = ismember(vids,videoNames) ; assert(all(ia)) ; sets(splitno,ib) = t.data' ; cats(ib) = repmat(catNames(i),numel(ia),1) ; end [cu,~,labels] = unique(cats) ; sets(sets(:)==2) = 3 ; imdb.classes.name = cu ; imdb.images.name = videoNames ; imdb.images.names = frameNames ; imdb.images.label = labels' ; imdb.images.sets = sets ; imdb.imageDir = 
fullfile(opts.dataDir, 'frames') ; ================================================ FILE: Datasets/cnn_ucf101_of_setup_data.m ================================================ function imdb = cnn_ucf101_of_setup_data(varargin) % CNN_UCF101_SETUP_DATA Initialize UCF101 - Action Recognition Data Set % http://crcv.ucf.edu/data/UCF101.php % this script requires UCF101 downloaded and frames extracted in frames % folder opts.dataDir = fullfile('data','UCF101') ; opts.lite = false ; opts = vl_argparse(opts, varargin) ; %% ------------------------------------------------------------------------ % Load categories metadata % ------------------------------------------------------------------------- % find metadata metaPath = fullfile(opts.dataDir, 'ucfTrainTestlist/classInd.txt') ; fprintf('using metadata %s\n', metaPath) ; tmp = importdata(metaPath); nCls = numel(tmp); if nCls ~= 101 error('Wrong meta file %s',metaPath); end cats = cell(1,nCls); for i=1:numel(tmp) t = strsplit(tmp{i}); cats{i} = t{2}; end imdb.classes.name = sort(cats) ; imdb.imageDir = fullfile(opts.dataDir, 'tvl1_flow', 'u') ; %% ------------------------------------------------------------------------ % load image names and labels % ------------------------------------------------------------------------- fprintf('searching training images ...\n') ; names = {} ; name = {}; labels = {} ; for d = dir(fullfile(imdb.imageDir, 'v_*'))' [~,lab] = ismember(lower(d.name(3:end-8)), lower(cats)) ; if lab==0 error('no class label found for %s',d.name); end ims = dir(fullfile(imdb.imageDir, d.name, '*.jpg')) ; name{end+1} = d.name; names{end+1} = strcat([d.name, filesep], {ims.name}) ; labels{end+1} = lab ; if mod(numel(names), 10) == 0, fprintf('.') ; end if mod(numel(names), 500) == 0, fprintf('\n') ; end %fprintf('found %s with %d images\n', d.name, numel(ims)) ; end % names = horzcat(names{:}) ; labels = horzcat(labels{:}) ; % labels = [labels ; labels] ; labels = labels(:)' ; for i=1:numel(names) nn = names{i} ; 
nn1 = strcat('u/',nn) ; nn2 = strcat('v/',nn) ; names{i} = cell(1,2*numel(nn1)) ; names{i}(1:2:end) = nn1 ; names{i}(2:2:end) = nn2 ; end imdb.images.id = 1:numel(names) ; imdb.images.name = name ; imdb.images.names = names ; imdb.images.label = labels ; imdb.imageDir = fullfile(opts.dataDir, 'tvl1_flow') ; %% ------------------------------------------------------------------------ % load train / test splits % ------------------------------------------------------------------------- fprintf('labeling data...(this may take couple of minutes)') ; imdb.images.sets = zeros(3, numel(names)) ; setNames = {'train','test'}; setVal = [1,3]; for s=1:numel(setNames) for i=1:3 trainFl = fullfile(opts.dataDir, 'ucfTrainTestlist',sprintf('%slist%02d.txt',... setNames{s},i)) ; trainList = importdata(trainFl); if isfield(trainList,'textdata') trainList = trainList.textdata; end for j=1:numel(trainList) tmp = strsplit(trainList{j},'/'); [~,lab] = ismember(lower(tmp{2}(1:end-4)), lower(name)) ; if lab==0 % error('cannot find the video %s',tmp{2}(1:end-4)); warning('cannot find the video %s',tmp{2}(1:end-4)); continue ; end % if trainList.data(j) ~= labels(lab) % error('Labels do not match for %s',tmp{2}); % end imdb.images.sets(i,lab) = setVal(s); end end end fprintf('\n') ; %% ------------------------------------------------------------------------ % Postprocessing % ------------------------------------------------------------------------- % sort categories by WNID (to be compatible with other implementations) [imdb.classes.name,perm] = sort(imdb.classes.name) ; relabel(perm) = 1:numel(imdb.classes.name) ; ok = imdb.images.label > 0 ; imdb.images.label(ok) = relabel(imdb.images.label(ok)) ; if opts.lite % pick a small number of images for the first 10 classes % this cannot be done for test as we do not have test labels clear keep ; for i=1:10 sel = find(imdb.images.label == i) ; train = sel(imdb.images.sets(1,sel) == 1) ; test = sel(imdb.images.sets(1,sel) == 3) ; keep{i} = [train 
test] ; end keep = keep{:}; imdb.images.id = imdb.images.id(keep) ; imdb.images.name = imdb.images.name(keep) ; imdb.images.names = imdb.images.names(keep) ; imdb.images.sets = imdb.images.sets(1,keep) ; imdb.images.label = imdb.images.label(keep) ; end ================================================ FILE: Datasets/cnn_ucf101_setup_data.m ================================================ function imdb = cnn_ucf101_setup_data(varargin) % CNN_UCF101_SETUP_DATA Initialize UCF101 - Action Recognition Data Set % http://crcv.ucf.edu/data/UCF101.php % this script requires UCF101 downloaded and frames extracted in frames % folder opts.dataDir = fullfile('data','UCF101') ; opts.lite = false ; opts = vl_argparse(opts, varargin) ; %% ------------------------------------------------------------------------ % Load categories metadata % ------------------------------------------------------------------------- % find metadata metaPath = fullfile(opts.dataDir, 'ucfTrainTestlist/classInd.txt') ; fprintf('using metadata %s\n', metaPath) ; tmp = importdata(metaPath); nCls = numel(tmp); if nCls ~= 101 error('Wrong meta file %s',metaPath); end cats = cell(1,nCls); for i=1:numel(tmp) t = strsplit(tmp{i}); cats{i} = t{2}; end imdb.classes.name = cats ; imdb.imageDir = fullfile(opts.dataDir, 'frames') ; %% ------------------------------------------------------------------------ % load image names and labels % ------------------------------------------------------------------------- fprintf('searching training images ...\n') ; names = {} ; name = {}; labels = {} ; for d = dir(fullfile(imdb.imageDir, 'v_*'))' [~,lab] = ismember(lower(d.name(3:end-8)), lower(cats)) ; if lab==0 error('no class label found for %s',d.name); end ims = dir(fullfile(imdb.imageDir, d.name, '*.jpg')) ; name{end+1} = d.name; names{end+1} = strcat([d.name, filesep], {ims.name}) ; labels{end+1} = lab ; if mod(numel(names), 10) == 0, fprintf('.') ; end if mod(numel(names), 500) == 0, fprintf('\n') ; end %fprintf('found 
%s with %d images\n', d.name, numel(ims)) ; end % names = horzcat(names{:}) ; labels = horzcat(labels{:}) ; imdb.images.id = 1:numel(names) ; imdb.images.name = name ; imdb.images.names = names ; imdb.images.label = labels ; %% ------------------------------------------------------------------------ % load train / test splits % ------------------------------------------------------------------------- fprintf('labeling data...(this may take couple of minutes)') ; imdb.images.sets = zeros(3, numel(names)) ; setNames = {'train','test'}; setVal = [1,3]; for s=1:numel(setNames) for i=1:3 trainFl = fullfile(opts.dataDir, 'ucfTrainTestlist',sprintf('%slist%02d.txt',... setNames{s},i)) ; trainList = importdata(trainFl); if isfield(trainList,'textdata') trainList = trainList.textdata; end for j=1:numel(trainList) tmp = strsplit(trainList{j},'/'); [~,lab] = ismember(lower(tmp{2}(1:end-4)), lower(name)) ; if lab==0 error('cannot find the video %s',tmp{2}); end % if trainList.data(j) ~= labels(lab) % error('Labels do not match for %s',tmp{2}); % end imdb.images.sets(i,lab) = setVal(s); end end end fprintf('\n') ; %% ------------------------------------------------------------------------ % Postprocessing % ------------------------------------------------------------------------- % sort categories by WNID (to be compatible with other implementations) [imdb.classes.name,perm] = sort(imdb.classes.name) ; relabel(perm) = 1:numel(imdb.classes.name) ; ok = imdb.images.label > 0 ; imdb.images.label(ok) = relabel(imdb.images.label(ok)) ; if opts.lite % pick a small number of images for the first 10 classes % this cannot be done for test as we do not have test labels clear keep ; for i=1:10 sel = find(imdb.images.label == i) ; train = sel(imdb.images.sets(1,sel) == 1) ; test = sel(imdb.images.sets(1,sel) == 3) ; keep{i} = [train test] ; end keep = keep{:}; imdb.images.id = imdb.images.id(keep) ; imdb.images.name = imdb.images.name(keep) ; imdb.images.names = imdb.images.names(keep) ; 
imdb.images.sets = imdb.images.sets(1,keep) ; imdb.images.label = imdb.images.label(keep) ; end ================================================ FILE: Layers/AppRankPooling.m ================================================ classdef AppRankPooling < dagnn.ElementWise % author: Hakan Bilen % dagnn wrapper for approximate rank pooling properties scale = 1 end methods function outputs = forward(obj, inputs, params) outputs{1} = vl_nnarpooltemporal(inputs{1},inputs{2}) * obj.scale ; end function [derInputs, derParams] = backward(obj, inputs, params, derOutputs) derInputs = cell(1,2); derInputs{1} = vl_nnarpooltemporal(inputs{1},inputs{2},derOutputs{1}) * obj.scale; derParams = {} ; end function outputSizes = getOutputSizes(obj, inputSizes) % This is not correct, dim(4) depends on inputs{2} outputSizes{1} = inputSizes{1} ; end function obj = AppRankPooling(varargin) obj.load(varargin) ; end end end ================================================ FILE: Layers/BatchNormN.m ================================================ classdef BatchNormN < dagnn.ElementWise properties numChannels epsilon = 1e-5 opts = {'NoCuDNN'} % ours seems slightly faster end properties (Transient) moments end methods function outputs = forward(obj, inputs, params) if strcmp(obj.net.mode, 'test') outputs{1} = vl_nnbnorm(inputs{1}, params{1}, params{2}, ... 'moments', params{3}, ... 'epsilon', obj.epsilon, ... obj.opts{:}) ; else [outputs{1},obj.moments] = ... vl_nnbnorm(inputs{1}, params{1}, params{2}, ... 'epsilon', obj.epsilon, ... obj.opts{:}) ; end end function [derInputs, derParams] = backward(obj, inputs, params, derOutputs) [derInputs{1}, derParams{1}, derParams{2}, derParams{3}] = ... vl_nnbnorm(inputs{1}, params{1}, params{2}, derOutputs{1}, ... 'epsilon', obj.epsilon, ... 'moments', obj.moments, ... 
obj.opts{:}) ; obj.moments = [] ; % multiply the moments update by the number of images in the batch % this is required to make the update additive for subbatches % and will eventually be normalized away % derParams{3} = derParams{3} * size(inputs{1},4) ; end % --------------------------------------------------------------------- function obj = BatchNormN(varargin) obj.load(varargin{:}) ; end function params = initParams(obj) params{1} = ones(obj.numChannels,1,'single') ; params{2} = zeros(obj.numChannels,1,'single') ; params{3} = zeros(obj.numChannels,2,'single') ; end function attach(obj, net, index) attach@dagnn.ElementWise(obj, net, index) ; p = net.getParamIndex(net.layers(index).params{3}) ; net.params(p).trainMethod = 'average' ; net.params(p).learningRate = 0.1 ; end end end ================================================ FILE: Layers/ErrorMultiClass.m ================================================ classdef ErrorMultiClass < dagnn.Loss % author: Hakan Bilen % computes multi-class accuracy % inputs{1}->scores % inputs{2}->gt labels properties nImgPerClass = [] nCorPred = [] accuracy = [] resetLayer = false end methods function outputs = forward(obj, inputs, params) if numel(inputs)~=2 error('wrong number of inputs'); end nCls = size(inputs{1},3); if obj.resetLayer || isempty(obj.nImgPerClass) obj.nImgPerClass = zeros(1,size(inputs{1},3)); obj.nCorPred = zeros(1,size(inputs{1},3)); obj.accuracy = zeros(1,size(inputs{1},3)); if obj.resetLayer obj.resetLayer = false ; obj.average = 0 ; end end [~,predictions] = max(gather(squeeze(inputs{1})),[],1); for c=1:nCls obj.nImgPerClass(c) = obj.nImgPerClass(c) + sum(inputs{2}==c); obj.nCorPred(c) = obj.nCorPred(c) + sum(predictions==c & inputs{2}==c); end ni = obj.nImgPerClass; ni(ni==0) = 1; obj.accuracy = obj.nCorPred ./ ni; obj.average = (1-mean(obj.accuracy)); outputs{1} = obj.average; end function [derInputs, derParams] = backward(obj, inputs, params, derOutputs) derInputs = cell(1,2); derParams = {} ; end 
function reset(obj) obj.resetLayer = true ; % obj.nImgPerClass = []; % obj.nCorPred = []; % obj.accuracy = []; % obj.average = 0; end function obj = ErrorMultiClass(varargin) obj.load(varargin) ; obj.loss = 'error_multi_class' ; end end end ================================================ FILE: Layers/L2Normalize.m ================================================ classdef L2Normalize < dagnn.ElementWise % author: Hakan Bilen % dagnn wrapper for l2 normalization properties scale = 1; clip = [-inf inf]; offset = 0; end methods function outputs = forward(obj, inputs, params) outputs{1} = vl_nnl2norm(inputs{1},[obj.scale obj.clip obj.offset]); end function [derInputs, derParams] = backward(obj, inputs, params, derOutputs) derInputs{1} = vl_nnl2norm(inputs{1},[obj.scale obj.clip obj.offset],derOutputs{1}); derParams = {} ; end function obj = L2Normalize(varargin) obj.load(varargin) ; end end end ================================================ FILE: Layers/LossNormalized.m ================================================ classdef LossNormalized < dagnn.Loss % properties % loss = 'softmaxlog' % ignoreAverage = false % opts = {} % end % properties (Transient) % average = 0 % numAveraged = 0 % end methods function outputs = forward(obj, inputs, params) outputs{1} = vl_nnloss(inputs{1}, inputs{2}, [], 'loss', obj.loss, obj.opts{:}) ; obj.accumulateAverage(inputs, outputs); if numel(size(inputs{1}))>3 bs = size(inputs{1},4) ; else bs = 1 ; end outputs{1} = outputs{1} / bs ; end function accumulateAverage(obj, inputs, outputs) if obj.ignoreAverage, return; end; n = obj.numAveraged ; m = n + size(inputs{1}, 1) * size(inputs{1}, 2) * size(inputs{1}, 4); obj.average = bsxfun(@plus, n * obj.average, gather(outputs{1})) / m ; obj.numAveraged = m ; end function [derInputs, derParams] = backward(obj, inputs, params, derOutputs) if numel(size(inputs{1}))>3 bs = size(inputs{1},4) ; else bs = 1 ; end derInputs{1} = vl_nnloss(inputs{1}, inputs{2}, derOutputs{1}, 'loss', obj.loss, 
obj.opts{:}) / bs; derInputs{2} = [] ; derParams = {} ; end function reset(obj) obj.average = 0 ; obj.numAveraged = 0 ; end function outputSizes = getOutputSizes(obj, inputSizes, paramSizes) outputSizes{1} = [1 1 1 inputSizes{1}(4)] ; end function rfs = getReceptiveFields(obj) % the receptive field depends on the dimension of the variables % which is not known until the network is run rfs(1,1).size = [NaN NaN] ; rfs(1,1).stride = [NaN NaN] ; rfs(1,1).offset = [NaN NaN] ; rfs(2,1) = rfs(1,1) ; end function obj = LossNormalized(varargin) obj.load(varargin) ; end end end ================================================ FILE: Layers/TemporalPooling.m ================================================ classdef TemporalPooling < dagnn.ElementWise % author: Hakan Bilen % dagnn wrapper for approximate rank pooling properties method = 'max'; end methods function outputs = forward(obj, inputs, params) outputs{1} = vl_nnpooltemporal(inputs{1},inputs{2},obj.method); end function [derInputs, derParams] = backward(obj, inputs, params, derOutputs) derInputs = cell(1,2); derInputs{1} = vl_nnpooltemporal(inputs{1},inputs{2},obj.method,derOutputs{1}); derParams = {} ; end function obj = TemporalPooling(varargin) obj.load(varargin) ; end end end ================================================ FILE: Layers/vl_nnarpooltemporal.m ================================================ function Y = vl_nnarpooltemporal(X,ids,dzdy) % author: Hakan Bilen % approximate rank pooling % ids indicates frame-video association (must be in range [1-N]) sz = size(X); forward = logical(nargin<3); if numel(ids)~=size(X,4) error('Error: ids dimension does not match with X!'); end nVideos = max(ids); if forward Y = zeros([sz(1:3),nVideos],'like',X); else Y = zeros(size(X),'like',X); end for v=1:nVideos % pool among frames indv = find(ids==v); if isempty(indv) error('Error: No frames in video %d',v); end N = numel(indv); % magic numbers fw = zeros(1,N); if N==1 fw = 1; else for i=1:N fw(i) = sum((2*(i:N)-N-1) ./ 
(i:N)); end end if forward Y(:,:,:,v) = sum(bsxfun(@times,X(:,:,:,indv),... reshape(single(fw),[1 1 1 numel(indv)])),4); else Y(:,:,:,indv) = (bsxfun(@times,repmat(dzdy(:,:,:,v),[1,1,1,numel(indv)]),... reshape(fw,[1 1 1 numel(indv)]))) ; end end % % if forward % fprintf(' fwd-arpool %.2f ',sqrt(sum(Y(:).^2))); % else % fprintf(' back-arpool %f ',sqrt(sum(Y(:).^2))); % end ================================================ FILE: Layers/vl_nnl2norm.m ================================================ function y = vl_nnl2norm(x,param,dzdy) % author: Hakan Bilen % l2 normalize whole feature map sc = param(1); clip = param(2:3); offset = param(4); if nargin == 3 assert(all(size(x) == size(dzdy))); else dzdy = []; end x_sz = size(x); if ~all(x_sz([1 2]) == 1) % Create an array of size #channels x #samples x = reshape(x, prod(x_sz(1:3)), []); end x = x + offset; if isempty(dzdy) y = (bsxfun(@times, x, sc./(sqrt(sum(x .* x)) + single(1e-12)))); % clip max values if all(y(:)<clip(1)) || all(y(:)>clip(2)) warning('Too small clipping interval'); fprintf('min %f max %f\n',min(y(:)),max(y(:))); end y(y(:)<clip(1)) = clip(1); y(y(:)>clip(2)) = clip(2); else if ~all(x_sz([1 2]) == 1) dzdy = reshape(dzdy, prod(x_sz(1:3)), []); end len_ = 1./sqrt(sum(x.*x)+single(1e-12)); dzdy_ = bsxfun(@times,dzdy,len_.^3); y = sc * (bsxfun(@times,dzdy,len_)-bsxfun(@times,x,sum(x.*dzdy_))); end if ~all(x_sz([1 2]) == 1) y = reshape(y, x_sz); end % % if isempty(dzdy) % fprintf(' fwd-l2 %.2f ',sqrt(sum(y(:).^2))); % else % fprintf(' back-l2 %f dzdy %f ',sqrt(sum(y(:).^2)),sqrt(sum(dzdy(:).^2))); % end ================================================ FILE: Layers/vl_nnpooltemporal.m ================================================ function Y = vl_nnpooltemporal(X,ids,method,dzdy) % author: Hakan Bilen % temporal pooling along frames % ids indicates frame-video association % method 'max' or 'avg' sz = size(X); forward = logical(nargin<4); Xp = permute(X,[4,1,2,3]); if numel(ids)~=size(X,4) error('Error: ids dimension does not match with X!'); end
nVideos = max(ids); if forward Yp = zeros([nVideos,sz(1:3)],'like',X); for v=1:nVideos % pool among frames indv = find(ids==v); Yp(v,:,:,:) = vl_nnpool(Xp(indv,:,:,:), [numel(indv),1], ... 'pad', 0, 'stride', [numel(indv),1], 'method', method) ; end else dzdyp = permute(dzdy,[4,1,2,3]); Yp = zeros(size(Xp),'like',Xp); for v=1:nVideos % pool among frames indv = find(ids==v); Yp(indv,:,:,:) = vl_nnpool(Xp(indv,:,:,:), [numel(indv),1], dzdyp(v,:,:,:), ... 'pad', 0, 'stride', [numel(indv),1], 'method', method) ; end end % permute back Y = permute(Yp,[2,3,4,1]); % if forward % fprintf(' fwd-ptemp %.2f ',sqrt(sum(Y(:).^2))); % else % fprintf(' back-ptemp %.2f ',sqrt(sum(Y(:).^2))); % end ================================================ FILE: README.md ================================================ # Dynamic Image Networks for Action Recognition ## Improved Results (see the extended version of CVPR paper) ResNeXt-50 | HMDB51 (%) | UCF101 (%) | ------------------|--------|--------| SI | 53.5 | 87.6 | DI | 57.3 | 86.6 | OF | 55.8 | 84.9 | DOF | 58.9 | 86.6 | SI+OF | 67.5 | 93.9 | SI+DI | 61.3 | 90.6 | OF+DOF | 62.6 | 89.1 | SI+DI+OF+DOF | 71.5 | 95.0 | SI+DI+OF+DOF+iDT | 74.2 | 95.4 | * Results are in the standard average multi-class accuracy (%) * SI: RGB image * DI: dynamic RGB image * OF: optical flow * DOF: dynamic optical flow * iDT: improved trajectory features ## Installation 1. Clone the Dynamic Image Net repository: ```Shell git clone --recursive https://github.com/hbilen/dynamic-image-nets ``` 2. Compile matconvnet toolbox: (see [http://www.vlfeat.org/matconvnet/install/](http://www.vlfeat.org/matconvnet/install/)) 3. Install additional matconvnet packages ```Shell run matconvnet/matlab/vl_setupnn.m ; vl_contrib install mcnExtraLayers ; vl_contrib setup mcnExtraLayers ; vl_contrib install autonn ; vl_contrib setup autonn ; ``` 4. Download your dataset : (e.g. UCF101 from [http://crcv.ucf.edu/data/UCF101.php](http://crcv.ucf.edu/data/UCF101.php)) 5. 
Convert videos to frames, resize them to 256x256 and store them in such a directory structure: Alternatively, you can download RGB and precomputed optical flow frames from [Christoph Feichtenhofer](http://ftp.tugraz.at/pub/feichtenhofer/tsfusion/data/) and copy RGB frames under "UCF101/frames" and optical flow frames under "UCF101/tvl1_flow". ```Shell data/UCF101/ucfTrainTestlist/ ├── classInd.txt ├── testlist01.txt ├── testlist02.txt ├── testlist03.txt ├── trainlist01.txt ├── trainlist02.txt └── trainlist03.txt data/UCF101/frames/ ├── v_ApplyEyeMakeup_g01_c01 │ ├── 00001.jpg │ ├── 00002.jpg │ ├── 00003.jpg │ ├── 00004.jpg │ ├── 00005.jpg ``` ## Compute and Visualise Approximate Dynamic Images 1. If you want to compute approximate dynamic images, get a list of ordered frames from a video and try ```matlab di = compute_approximate_dynamic_images(images) ; ``` 2. If you want to visualise approximate dynamic images, get a list of ordered frames from a video and try ```matlab visualize_approximate_dynamic_images(images) ``` ## Train a Dynamic Image Net You can modify the options in `main_train.m` and train your model by running ```matlab main_train ``` Note: If you want to train a model on a different dataset than UCF101 or HMDB51, you need to write a custom script `cnn_dataset_setup_data` to build your database (imdb). ## Evaluation 1. Download the CNN Models for the UCF101 dataset, that are used in the journal, from [here](http://groups.inf.ed.ac.uk/hbilen-data/data/resnext50_dicnn.tar). 2. Choose the right model, split and input type (e.g.) ```matlab net = load('resnext50-rgb-arpool-split1.mat') ; net = dagnn.DagNN.loadobj(net) ; net.addLayer('errMC',ErrorMultiClass(),{'prediction','label'},'mcerr') ; opts.network = net ; opts.split = 1 ; opts.train.gpus = 1 ; opts.epochFactor = 0 ; [net, info] = cnn_dicnn_rgb(opts) ``` ## Citing Dynamic Image Networks If you find the code useful, please cite: @inproceedings{Bilen2016a, author = "Bilen, H. and Fernando, B. 
and Gavves, E. and Vedaldi, A. and Gould, S.", title = "Dynamic Image Networks for Action Recognition", booktitle = "CVPR", year = "2016" } @article{Bilen2017a, author = "Bilen, H. and Fernando, B. and Gavves, E. and Vedaldi, A.", title = "Action Recognition with Dynamic Image Networks", journal = "IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)", year = "2017" } ## License The analysis work performed with the program(s) must be non-proprietary work. Licensee and its contract users must be or be affiliated with an academic facility. Licensee may additionally permit individuals who are students at such academic facility to access and use the program(s). Such students will be considered contract users of licensee. The program(s) may not be used for commercial competitive analysis (such as benchmarking) or for any commercial activity, including consulting. ================================================ FILE: dicnn/cnn_dicnn_of.m ================================================ function [net, info] = cnn_dicnn_of(varargin) %CNN_DICNN_OF Fine-tunes a pre-trained CNN with dynamic images on optical % flow frames (DOF in pami journal) on UCF101 dataset run(fullfile(fileparts(mfilename('fullpath')), ... '..', 'matconvnet', 'matlab', 'vl_setupnn.m')) ; run(fullfile(fileparts(mfilename('fullpath')), ... '..', 'matconvnet', 'contrib', 'mcnExtraLayers', 'setup_mcnExtraLayers.m')) ; run(fullfile(fileparts(mfilename('fullpath')), ... 
'..', 'matconvnet', 'contrib', 'autonn', 'setup_autonn.m')) ; addpath Layers Datasets opts.dataDir = fullfile('data','UCF101') ; opts.expDir = fullfile('exp', 'UCF101') ; opts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat.mat') ; [opts, varargin] = vl_argparse(opts, varargin) ; opts.numFetchThreads = 8 ; opts.lite = false ; opts.imdbPath = fullfile(opts.dataDir, 'imdb-of.mat'); opts.pool1Layer = 'conv0'; % before conv1 opts.pool1Type = 'arpool'; % before conv1 opts.pool2Layer = 'fc6'; % before conv1 opts.DropOutRate = 0.85 ; opts.datasetFn = @cnn_ucf101_of_setup_data ; opts.networkFn = @cnn_init_resnext ; opts.network = [] ; opts.split = 1; % data split opts.reverseDyn = 0; % reverse video frames e.g.[N:-1:1] opts.numDynImgs = 10 ; opts.epochFactor = 5 ; opts.train = struct() ; opts.train.gpus = []; opts.train.batchSize = 128 ; opts.train.numSubBatches = 32 ; opts.train.solver = [] ; opts.train.prefetch = true ; opts.train.learningRate = 1e-2 ; opts.train.numEpochs = 30 ; % opts.train.savePreds = true ; opts.train.randomSeed = 0 ; opts = vl_argparse(opts, varargin) ; if ~isfield(opts.train, 'gpus'), opts.train.gpus = []; end; % ------------------------------------------------------------------------- % Prepare data % ------------------------------------------------------------------------- if exist(opts.imdbPath,'file') imdb = load(opts.imdbPath) ; else imdb = opts.datasetFn('dataDir', opts.dataDir, 'lite', opts.lite) ; mkdir(opts.expDir) ; save(opts.imdbPath, '-struct', 'imdb') ; end % UCF101 has 3 data splits if opts.split>3 error('split should be <=3'); end imdb.images.set = imdb.images.sets(opts.split,:); % reverse frame order if opts.reverseDyn for i=1:numel(imdb.images.names) imdb.images.names{i} = imdb.images.names{i}(end:-1:1); end end % ------------------------------------------------------------------------- % Prepare model % ------------------------------------------------------------------------- if isempty(opts.network) net = 
load(opts.modelPath); if isfield(net,'net') net = net.net; end opts.nCls = max(imdb.images.label) ; % net = dagnn.DagNN.loadobj(net) ; net = opts.networkFn(net,opts) ; % two channels instead of 3 RGB net.params(1).value = net.params(1).value(:,:,1:2,:) ; % Set the class names in the network net.meta.classes.name = imdb.classes.name ; net.meta.classes.description = imdb.classes.name ; else assert(isa(opts.network,'dagnn.DagNN')) ; net = opts.network ; end % ------------------------------------------------------------------------- % Learn % ------------------------------------------------------------------------- if opts.epochFactor>0 opts.train.train = repmat(find(imdb.images.set==1),[1 opts.epochFactor]) ; else opts.train.train = NaN ; opts.train.numEpochs = 1 ; end opts.train.val = find(imdb.images.set==3) ; [net, info] = cnn_train_dicnn_dag(net, imdb, getBatchFn(opts, net.meta), ... 'expDir', opts.expDir, ... opts.train) ; % ------------------------------------------------------------------------- % Report accuracy % ------------------------------------------------------------------------- errlayer = net.getLayerIndex('errMC') ; if ~isnan(errlayer) cats = imdb.classes.name ; accs = net.layers(errlayer).block.accuracy ; if numel(cats)~=numel(accs) error('wrong number of classes\n') ; end for i=1:numel(cats) fprintf('%s acc %.1f\n',cats{i},100*accs(i)) ; end fprintf('Mean accuracy %.1f\n',100*mean(accs)) ; end % ------------------------------------------------------------------------- function fn = getBatchFn(opts, meta) % ------------------------------------------------------------------------- useGpu = numel(opts.train.gpus) > 0 ; bopts.numThreads = opts.numFetchThreads ; bopts.imageSize = meta.normalization.imageSize ; if isfield(meta.normalization,'border') bopts.border = meta.normalization.border ; else bopts.border = meta.normalization.imageSize(1:2) ./ ... 
meta.normalization.cropSize - meta.normalization.imageSize(1:2); end bopts.averageImage = 128 * ones([1 1 2],'single') ; bopts.numDynImgs = opts.numDynImgs ; fn = @(x,y) getDagNNBatch(bopts,useGpu,x,y) ; % ------------------------------------------------------------------------- function inputs = getDagNNBatch(opts, useGpu, imdb, batch) % ------------------------------------------------------------------------- % batch refers to videos (not for frames) if isempty(batch) inputs = {'input', [], 'label', [], 'VideoId1', [], 'VideoId2', []}; return; end isVal = ~isempty(batch) && imdb.images.set(batch(1)) ~= 1 ; if ~isVal, transformation='multiScaleRegular'; else transformation='none';end names = imdb.images.names(batch); % images = strcat([imdb.imageDir filesep], imdb.images.name(batch)) ; namesM = {}; nVids = numel(batch); VideoId1 = []; VideoId2 = []; % step-size stepSize = 6; % pool nFrames into a dynamic image nFrames = 10; % number of dynamic images to be max pooled later nDynImgs = opts.numDynImgs ; opts = rmfield(opts,'numDynImgs') ; c1 = 1; for v=1:nVids name = names{v}; nFrms = numel(name)/2; nSample = nFrames; if isVal startF = 1 ; else startF = ceil(stepSize/2) ; end nr = numel(startF:stepSize:nFrms); % jitter by removing 50 % and limit a batch to nMaxs * nSamples images if nr > 1 && (~isVal && nr>nDynImgs) rat = min(nDynImgs,ceil(0.50*nr)); ri = randperm(nr); ri = ri(1:rat); r = zeros(1,nr); r(ri) = 1; else r = ones(1,nr); end c3 = 1; c2 = 0; for f=startF:stepSize:nFrms if r(c3) idx = f:min(f+nSample-1,nFrms) ; if numel(idx) 0 if useGpu im = gpuArray(im) ; end inputs = {'input', im, 'label', imdb.images.label(batch), ... 
'VideoId1', VideoId1, 'VideoId2', VideoId2}; end ================================================ FILE: dicnn/cnn_dicnn_rgb.m ================================================ function [net, info] = cnn_dicnn_rgb(varargin) %CNN_DICNN_RGB Fine-tunes a pre-trained CNN with dynamic images on RGB frames % (DI in pami journal) on UCF101 dataset run(fullfile(fileparts(mfilename('fullpath')), ... '..', 'matconvnet', 'matlab', 'vl_setupnn.m')) ; run(fullfile(fileparts(mfilename('fullpath')), ... '..', 'matconvnet', 'contrib', 'mcnExtraLayers', 'setup_mcnExtraLayers.m')) ; run(fullfile(fileparts(mfilename('fullpath')), ... '..', 'matconvnet', 'contrib', 'autonn', 'setup_autonn.m')) ; addpath Layers Datasets opts.dataDir = fullfile('data','UCF101') ; opts.expDir = fullfile('exp', 'UCF101') ; opts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat'); opts.datasetFn = @cnn_ucf101_setup_data ; opts.networkFn = @cnn_init_resnext ; opts.network = [] ; [opts, varargin] = vl_argparse(opts, varargin) ; opts.numFetchThreads = 8 ; opts.lite = false ; opts.imdbPath = fullfile(opts.dataDir, 'imdb-rgb.mat'); opts.pool1Layer = 'conv0'; % before conv1 opts.pool1Type = 'arpool'; opts.pool2Layer = 'pool5'; opts.pool2Type = 'maxpool'; opts.DropOutRate = 0.5 ; opts.epochFactor = 5 ; opts.split = 1; % data split opts.reverseDyn = 0; % reverse video frames e.g.[N:-1:1] opts.train = struct() ; opts.train.gpus = []; opts.train.batchSize = 128 ; opts.train.numSubBatches = 16 ; opts.train.solver = [] ; opts.train.prefetch = true ; opts.train.numEpochs = 30 ; opts.train.randomSeed = 0 ; % resnet50 % opts.train.learningRate = 1e-2 * [ones(1,2), 0.1*ones(1,1), 0.01*ones(1,1)]; % caffe-ref opts.train.learningRate = 1e-3 * [ones(1,2), 0.1*ones(1,1), 0.01*ones(1,1)]; opts = vl_argparse(opts, varargin) ; if ~isfield(opts.train, 'gpus'), opts.train.gpus = []; end % opts.train.numEpochs = numel(opts.train.learningRate); % ------------------------------------------------------------------------- % 
Prepare data % ------------------------------------------------------------------------- if exist(opts.imdbPath,'file') imdb = load(opts.imdbPath) ; else imdb = opts.datasetFn('dataDir', opts.dataDir, 'lite', opts.lite) ; mkdir(opts.expDir) ; save(opts.imdbPath, '-struct', 'imdb') ; end % UCF101 has 3 data splits if opts.split>3 error('split should be <=3'); end imdb.images.set = imdb.images.sets(opts.split,:); % reverse frame order if opts.reverseDyn for i=1:numel(imdb.images.names) imdb.images.names{i} = imdb.images.names{i}(end:-1:1); end end % ------------------------------------------------------------------------- % Prepare model % ------------------------------------------------------------------------- if isempty(opts.network) net = load(opts.modelPath); if isfield(net,'net') net = net.net; end opts.nCls = max(imdb.images.label) ; net = opts.networkFn(net,opts); if numel(net.meta.normalization.averageImage)>3 sz = size(net.meta.normalization.averageImage) ; net.meta.normalization.averageImage = ... mean(reshape(net.meta.normalization.averageImage,[sz(1)*sz(2) sz(3)]),1) ; end % Set the class names in the network net.meta.classes.name = imdb.classes.name ; net.meta.classes.description = imdb.classes.name ; else assert(isa(opts.network,'dagnn.DagNN')) ; net = opts.network ; end % ------------------------------------------------------------------------- % Learn % ------------------------------------------------------------------------- if opts.epochFactor>0 opts.train.train = repmat(find(imdb.images.set==1),[1 opts.epochFactor]) ; else opts.train.train = NaN ; opts.train.numEpochs = 1 ; end opts.train.val = find(imdb.images.set==3) ; [net, info] = cnn_train_dicnn_dag(net, imdb, getBatchFn(opts, net.meta), ... 'expDir', opts.expDir, ... 
opts.train) ; % ------------------------------------------------------------------------- % Report accuracy % ------------------------------------------------------------------------- errlayer = net.getLayerIndex('errMC') ; if ~isnan(errlayer) cats = imdb.classes.name ; accs = net.layers(errlayer).block.accuracy ; if numel(cats)~=numel(accs) error('wrong number of classes\n') ; end for i=1:numel(cats) fprintf('%s acc %.1f\n',cats{i},100*accs(i)) ; end fprintf('Mean accuracy %.1f\n',100*mean(accs)) ; end % ------------------------------------------------------------------------- function fn = getBatchFn(opts, meta) % ------------------------------------------------------------------------- useGpu = numel(opts.train.gpus) > 0 ; bopts.numThreads = opts.numFetchThreads ; bopts.imageSize = meta.normalization.imageSize ; if isfield(meta.normalization,'border') bopts.border = meta.normalization.border ; else bopts.border = meta.normalization.imageSize(1:2) ./ ... meta.normalization.cropSize - meta.normalization.imageSize(1:2); end % bopts.averageImage = []; bopts.averageImage = meta.normalization.averageImage ; bopts.interpolation = meta.normalization.interpolation ; bopts.keepAspect = meta.normalization.keepAspect ; % bopts.rgbVariance = meta.augmentation.rgbVariance ; % bopts.transformation = meta.augmentation.transformation ; fn = @(x,y) getDagNNBatch(bopts,useGpu,x,y) ; % ------------------------------------------------------------------------- function inputs = getDagNNBatch(opts, useGpu, imdb, batch) % ------------------------------------------------------------------------- % batch refers to videos (not for frames) if isempty(batch) inputs = {'input', [], 'label', [], 'VideoId1', [], 'VideoId2', []}; return; end isVal = ~isempty(batch) && imdb.images.set(batch(1)) ~= 1 ; % if ~isVal, transformation='stretch'; else transformation='none';end if ~isVal, transformation='multiScaleRegular'; else transformation='none';end names = imdb.images.names(batch); % images = 
strcat([imdb.imageDir filesep], imdb.images.name(batch)) ; namesM = {}; nVids = numel(batch); VideoId1 = []; VideoId2 = []; % step-size stepSize = 6; % pool nFrames into a dynamic image nFrames = 10; % number of dynamic images to be max pooled later nDynImgs = 10; c1 = 1; for v=1:nVids name = names{v}; if isVal startF = 1 ; else startF = ceil(stepSize/2) ; end nFrms = numel(name); nSample = nFrames; nr = numel(startF:stepSize:nFrms); % jitter by removing 50 % and limit a batch to nMaxs * nSamples images if nr > 1 && (~isVal && nr>nDynImgs) rat = min(nDynImgs,ceil(0.50*nr)); ri = randperm(nr); ri = ri(1:rat); r = zeros(1,nr); r(ri) = 1; else if nr>2*nDynImgs rat = 2*nDynImgs; ri = randperm(nr); ri = ri(1:rat); r = zeros(1,nr); r(ri) = 1; else r = ones(1,nr); end end c3 = 1; c2 = 0; for f=startF:stepSize:nFrms if r(c3) idx = f:min(f+nSample-1,nFrms) ; if numel(idx) 0 if useGpu im = gpuArray(im) ; end inputs = {'input', im, 'label', imdb.images.label(batch), ... 'VideoId1', VideoId1, 'VideoId2', VideoId2}; end ================================================ FILE: dicnn/cnn_init_cafferef.m ================================================ % ------------------------------------------------------------------------- function net = cnn_init_cafferef(net,opts) % ------------------------------------------------------------------------- drop6p = find(cellfun(@(a) strcmp(a.name, 'dropout6'), net.layers)==1); drop7p = find(cellfun(@(a) strcmp(a.name, 'dropout7'), net.layers)==1); if ~isempty(drop6p) assert(~isempty(drop7p)); net.layers{drop6p}.rate = opts.DropOutRate; net.layers{drop7p}.rate = opts.DropOutRate; else relu6p = find(cellfun(@(a) strcmp(a.name, 'relu6'), net.layers)==1); relu7p = find(cellfun(@(a) strcmp(a.name, 'relu7'), net.layers)==1); drop6 = struct('type','dropout','rate', opts.DropOutRate,'name','dropout6') ; drop7 = struct('type','dropout','rate', opts.DropOutRate,'name','dropout7') ; net.layers = [net.layers(1:relu6p) drop6 net.layers(relu6p+1:relu7p) drop7 
net.layers(relu7p+1:end)]; end % replace fc8 fc8l = cellfun(@(a) strcmp(a.name, 'fc8'), net.layers)==1; nCls = opts.nCls ; % nCls = 101; sizeW = size(net.layers{fc8l}.weights{1}); if sizeW(4)~=nCls net.layers{fc8l}.weights = {zeros(sizeW(1),sizeW(2),sizeW(3),nCls,'single'), ... zeros(1, nCls, 'single')}; end % change loss % net.layers(end) = []; net.layers{end} = struct('name','loss', 'type','softmaxloss') ; % convert to dagnn net = dagnn.DagNN.fromSimpleNN(net, 'canonicalNames', true) ; poolLyr1 = find(arrayfun(@(a) strcmp(a.name, opts.pool1Layer), net.layers)==1); assert(~isempty(poolLyr1)); % configure appr-rank-pool switch opts.pool1Type case 'arpool' if strcmp(opts.pool1Layer,'conv1') net.addLayer('arpool',AppRankPooling('scale',1),{net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImgN'); net.addLayer('l2normalize',L2Normalize('scale',6000,'clip',[-128 128]),... 'DynImgN','DynImg'); else net.addLayer('arpool',AppRankPooling('scale',0.1),{net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImgN'); net.addLayer('reluP',dagnn.ReLU(),... {'DynImgN'},'DynImg'); end net.setLayerInputs(opts.pool1Layer,{'DynImg'}) ; case 'ppool1' if strcmp(opts.pool1Layer,'conv1') net.addLayer('parampool',LinComb('pad',[1 1 10 1]),... {net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImg',{'conv0f','conv0b'}); else net.addLayer('parampool',LinComb('pad',[1 1 10 1]),... {net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImgN',{'conv0f','conv0b'}); net.addLayer('reluP',dagnn.ReLU(),... {'DynImgN'},'DynImg'); end net.layers(poolLyr1).inputs{1} = 'DynImg' ; % net.params(end-1).value = 0.01 * randn(1,1,10,1,'single'); net.params(end-1).value = 0.1 * ones(1,1,10,1,'single'); net.params(end).value = zeros(1,1,'single'); net.params(end-1).learningRate = 0.1 ; net.params(end).learningRate = 0.2 ; case 'ppool2' if strcmp(opts.pool1Layer,'conv1') net.addLayer('parampool',LinComb('pad',[1 1 10 1]),... 
{net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImg',{'conv0f','conv0b'}); else net.addLayer('parampool',LinComb('pad',[1 1 10 1]),... {net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImgN',{'conv0f','conv0b'}); net.addLayer('reluP',dagnn.ReLU(),... {'DynImgN'},'DynImg'); end net.layers(poolLyr1).inputs{1} = 'DynImg' ; % net.params(end-1).value = 0.01 * randn(1,1,10,1,'single'); net.params(end-1).value = 0.1 * ones(1,1,10,1,'single'); net.params(end).value = zeros(1,1,'single'); net.params(end-1).learningRate = 0.1 ; net.params(end).learningRate = 0.2 ; case 'none' otherwise error('Unknown pool type %s', opts.pool1Type) ; end % second pool layer (max pooling) poolLyr2 = find(arrayfun(@(a) strcmp(a.name, opts.pool2Layer), net.layers)==1); net.addLayer('tempPoolMax',TemporalPooling('method','max'),... {net.layers(poolLyr2(1)).inputs{1},'VideoId2'},'tempPoolMax'); net.layers(poolLyr2).inputs{1} = 'tempPoolMax'; % add multi-class error net.addLayer('errMC',ErrorMultiClass(),{'prediction','label'},'mcerr'); net_ = net.saveobj ; net = dagnn.DagNN.loadobj(net_) ; net.removeLayer('loss') ; net.addLayer('loss', ... LossNormalized('loss', 'softmaxlog') ,... {'prediction', 'label'}, ... 'objective') ; % replace standard matconvnet bnorm with my version bns = find(arrayfun(@(a) strcmp(class(a.block), 'dagnn.BatchNorm'), net.layers)==1); for i=1:numel(bns) bb = net.layers(bns(i)).block ; net.layers(bns(i)).block = BatchNormN('numChannels',bb.numChannels,... 'epsilon',bb.epsilon,... 
'opts',bb.opts) ; end ================================================ FILE: dicnn/cnn_init_resnext.m ================================================ % ------------------------------------------------------------------------- function net = cnn_init_resnext(net,opts) % ------------------------------------------------------------------------- % initialize classifier net = dagnn.DagNN.loadobj(net) ; % convs = find(arrayfun(@(a) isa(a.block, 'dagnn.Conv'), net.layers)==1); fclayer = net.getLayer('classifier_0') ; sizeW = size(net.params(fclayer.paramIndexes(1)).value); % opts.nCls = 101; nCls = opts.nCls ; DropOutRate = opts.DropOutRate ; net.params(fclayer.paramIndexes(1)).value = ... 0.01 * randn([sizeW(1:3),nCls],'single') ; net.params(fclayer.paramIndexes(2)).value = zeros(nCls,1,'single') ; % change loss softmax = find(arrayfun(@(a) isa(a.block, 'dagnn.SoftMax'), net.layers)==1); if ~isempty(softmax) net.removeLayer(net.layers(softmax(1)).name) ; end % convs = find(arrayfun(@(a) isa(a.block, 'dagnn.Conv'), net.layers)==1); fclayer = find(arrayfun(@(a) strcmp(a.name, 'classifier_0'), net.layers)==1); net.renameVar(net.layers(fclayer(end)).name,'prediction') ; net.renameVar('data','input') ; %------------------------------------------------------------------------% % configure appr-rank-pool switch opts.pool1Type case 'arpool' if strcmp(opts.pool1Layer,'conv0') poolLyr1 = 1 ; net.addLayer('arpool',AppRankPooling('scale',0.1),{'input','VideoId1'},'DynImg'); net.setLayerInputs(net.layers(1).name,{'DynImg'}) ; else poolLyr1 = find(arrayfun(@(a) strcmp(a.name, opts.pool1Layer), net.layers)==1); assert(~isempty(poolLyr1)); net.addLayer('arpool',AppRankPooling('scale',0.1),{net.layers(poolLyr1).inputs{1},'VideoId1'},'DynImg'); net.setLayerInputs(opts.pool1Layer,{'DynImg'}) ; end case 'ppool1' if strcmp(opts.pool1Layer,'conv0') poolLyr1 = 1 ; else poolLyr1 = find(arrayfun(@(a) strcmp(a.name, opts.pool1Layer), net.layers)==1); end net.addLayer('parampool',LinComb('pad',[1 
1 10 1]),... {'features_4_0_merge','VideoId1'},'DynImg0',{'conv0f','conv0b'}); % net.params(end-1).value = 0.1 * ones(1,1,10,1,'single'); net.params(end-1).value = 0.1 * randn(1,1,10,1,'single'); net.params(end).value = zeros(1,1,'single'); net.addLayer('BnormDyn',dagnn.BatchNorm('numChannels',256),'DynImg0','DynImg',... {'dym','dyb','dybx'}) ; net.params(end-2).value = ones(256,1,'single') ; net.params(end-1).value = zeros(256,1,'single') ; net.params(end).value = zeros(256,2,'single') ; % net.addLayer('reluP',dagnn.ReLU(),... % {'DynImg1'},'DynImg'); net.layers(16).inputs{1} = 'DynImg' ; for i=numel(net.params)-4:numel(net.params), net.params(i).learningRate = 0.1 * net.params(i).learningRate; end case 'none' otherwise error('Unknown pool type %s', opts.pool1Type) ; end net.rebuild() ; %------------------------------------------------------------------------% % second pool layer (max pooling) % poolLyr2 = find(arrayfun(@(a) strcmp(a.name, 'pool5'), net.layers)==1); poolLyr2 = find(arrayfun(@(a) strcmp(a.name, 'features_7_1_merge'), net.layers)==1); net.addLayer('tempPoolMax',TemporalPooling('method','max'),... {net.layers(poolLyr2(1)).outputs{1},'VideoId2'},'tempPoolMax'); % change the input of fc last layer % net.setLayerInputs(net.layers(convs(end)).name,'tempPoolMax') ; % net.addLayer('bnar',dagnn.BatchNorm('numChannels',2048),{'tempPoolMax'},... % 'tempPoolMaxbn',{'bnar_m','bnar_b','bnar_x'}); poolLyr2next = find(arrayfun(@(a) strcmp(a.name, 'features_7_1_id_relu'), net.layers)==1); net.setLayerInputs(net.layers(poolLyr2next(1)).name,{'tempPoolMax'}) ; net.rebuild() ; %------------------------------------------------------------------------% % add drop-out layers if DropOutRate>0 pool5 = find(arrayfun(@(a) strcmp(a.name, 'features_8'), net.layers)==1); oo = net.layers(pool5(1)).outputs{1}; net.addLayer('drop_pool5',dagnn.DropOut('rate',DropOutRate),... 
oo,sprintf('drop_%s',oo),{}); net.setLayerInputs('classifier_permute',{sprintf('drop_%s',oo)}) ; end %------------------------------------------------------------------------% % add multi-class error net.addLayer('errMC',ErrorMultiClass(),{'prediction','label'},'mcerr'); net.addLayer('loss', ... LossNormalized('loss', 'softmaxlog') ,... {'prediction', 'label'}, ... 'objective') ; %------------------------------------------------------------------------% net.rebuild() % replace standard matconvnet bnorm with my version bns = find(arrayfun(@(a) strcmp(class(a.block), 'dagnn.BatchNorm'), net.layers)==1); for i=1:numel(bns) bb = net.layers(bns(i)).block ; net.layers(bns(i)).block = BatchNormN('numChannels',bb.numChannels,... 'epsilon',bb.epsilon,... 'opts',bb.opts) ; end % dagMergeBatchNorm(net) ; % dagRemoveLayersOfType(net, 'dagnn.BatchNorm') ; net_ = net.saveobj ; net = dagnn.DagNN.loadobj(net_) ; net.meta.normalization.border = [32 32] ; ================================================ FILE: dicnn/cnn_single_of.m ================================================ function [net, info] = cnn_single_of(varargin) %CNN_SINGLE_OF Demonstrates fine-tuning a pre-trained CNN with static % optical flow (OF in pami journal) on UCF101 dataset run(fullfile(fileparts(mfilename('fullpath')), ... 
'..', 'matconvnet', 'matlab', 'vl_setupnn.m')) ; addpath Layers Datasets opts.dataDir = fullfile('data','UCF101') ; opts.expDir = fullfile('exp', 'UCF101') ; opts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat.mat') ; [opts, varargin] = vl_argparse(opts, varargin) ; opts.numFetchThreads = 8 ; opts.lite = false ; opts.imdbPath = fullfile(opts.dataDir, 'imdb-of.mat'); opts.DropOutRate = 0.85 ; opts.datasetFn = @cnn_ucf101_of_setup_data ; opts.networkFn = @cnn_resnext_init ; opts.split = 1; % data split opts.reverseDyn = 0; % reverse video frames e.g.[N:-1:1] opts.numDynImgs = 10 ; opts.epochFactor = 5 ; opts.pool1Layer = 'conv0'; % before conv1 opts.pool1Type = 'none' ; opts.pool2Layer = 'fc6' ; opts.train = struct() ; opts.train.gpus = []; opts.train.batchSize = 128 ; opts.train.numSubBatches = 32 ; opts.train.solver = [] ; opts.train.prefetch = true ; opts.train.learningRate = 1e-2 ; opts.train.numEpochs = 30 ; opts = vl_argparse(opts, varargin) ; if ~isfield(opts.train, 'gpus'), opts.train.gpus = []; end; % ------------------------------------------------------------------------- % Prepare data % ------------------------------------------------------------------------- if exist(opts.imdbPath,'file') imdb = load(opts.imdbPath) ; else imdb = opts.datasetFn('dataDir', opts.dataDir, 'lite', opts.lite) ; mkdir(opts.expDir) ; save(opts.imdbPath, '-struct', 'imdb') ; end % UCF101 has 3 data splits if opts.split>3 error('split should be <=3'); end imdb.images.set = imdb.images.sets(opts.split,:); % reverse frame order if opts.reverseDyn for i=1:numel(imdb.images.names) imdb.images.names{i} = imdb.images.names{i}(end:-1:1); end end % ------------------------------------------------------------------------- % Prepare model % ------------------------------------------------------------------------- net = load(opts.modelPath); if isfield(net,'net') net = net.net; end opts.nCls = max(imdb.images.label) ; % net = dagnn.DagNN.loadobj(net) ; net = 
opts.networkFn(net,opts) ; % two channels instead of 3 RGB net.params(1).value = net.params(1).value(:,:,1:2,:) ; % Set the class names in the network net.meta.classes.name = imdb.classes.name ; net.meta.classes.description = imdb.classes.name ; % ------------------------------------------------------------------------- % Learn % ------------------------------------------------------------------------- if opts.epochFactor>0 opts.train.train = repmat(find(imdb.images.set==1),[1 opts.epochFactor]) ; else opts.train.train = NaN ; end opts.train.val = find(imdb.images.set==3) ; [net, info] = cnn_train_dicnn_dag(net, imdb, getBatchFn(opts, net.meta), ... 'expDir', opts.expDir, ... opts.train) ; % ------------------------------------------------------------------------- % Report accuracy % ------------------------------------------------------------------------- errlayer = net.getLayerIndex('errMC') ; if ~isnan(errlayer) cats = imdb.classes.name ; accs = net.layers(errlayer).block.accuracy ; if numel(cats)~=numel(accs) error('wrong number of classes\n') ; end for i=1:numel(cats) fprintf('%s acc %.1f\n',cats{i},100*accs(i)) ; end fprintf('Mean accuracy %.1f\n',100*mean(accs)) ; end % ------------------------------------------------------------------------- function fn = getBatchFn(opts, meta) % ------------------------------------------------------------------------- useGpu = numel(opts.train.gpus) > 0 ; bopts.numThreads = opts.numFetchThreads ; bopts.imageSize = meta.normalization.imageSize ; if isfield(meta.normalization,'border') bopts.border = meta.normalization.border ; else bopts.border = meta.normalization.imageSize(1:2) ./ ... 
meta.normalization.cropSize - meta.normalization.imageSize(1:2); end bopts.averageImage = 128 * ones([1 1 2],'single') ; bopts.numDynImgs = opts.numDynImgs ; % bopts.averageImage = meta.normalization.averageImage ; % bopts.rgbVariance = meta.augmentation.rgbVariance ; % bopts.transformation = meta.augmentation.transformation ; bopts.transformation = 'stretch' ; bopts.transformation = 'multiScaleRegular' ; fn = @(x,y) getDagNNBatch(bopts,useGpu,x,y) ; % ------------------------------------------------------------------------- function inputs = getDagNNBatch(opts, useGpu, imdb, batch) % ------------------------------------------------------------------------- % batch refers to videos (not for frames) if isempty(batch) inputs = {'input', [], 'label', [], 'VideoId1', [], 'VideoId2', []}; return; end isVal = ~isempty(batch) && imdb.images.set(batch(1)) ~= 1 ; if ~isVal, transformation='multiScaleRegular'; else transformation='none';end names = imdb.images.names(batch); % images = strcat([imdb.imageDir filesep], imdb.images.name(batch)) ; namesM = {}; nVids = numel(batch); VideoId1 = []; VideoId2 = []; % step-size stepSize = 6; % pool nFrames into a dynamic image nFrames = 1; % number of dynamic images to be max pooled later nDynImgs = opts.numDynImgs ; opts = rmfield(opts,'numDynImgs') ; c1 = 1; for v=1:nVids name = names{v}; nFrms = numel(name)/2; nSample = nFrames; nr = numel(1:stepSize:nFrms); % jitter by removing 50 % and limit a batch to nMaxs * nSamples images if nr > 1 && (~isVal && nr>nDynImgs) rat = min(nDynImgs,ceil(0.50*nr)); ri = randperm(nr); ri = ri(1:rat); r = zeros(1,nr); r(ri) = 1; else r = ones(1,nr); end c3 = 1; c2 = 0; for f=1:stepSize:nFrms if r(c3) idx = f:min(f+nSample-1,nFrms) ; if numel(idx) 0 if useGpu im = gpuArray(im) ; end inputs = {'input', im, 'label', imdb.images.label(batch), ... 
'VideoId2', VideoId2}; end ================================================ FILE: dicnn/cnn_single_rgb.m ================================================ function [net, info] = cnn_single_rgb(varargin) %CNN_SINGLE_RGB Demonstrates fine-tuning a pre-trained CNN with static % RGB frames (SI in pami journal) on UCF101 dataset run(fullfile(fileparts(mfilename('fullpath')), ... '..', 'matconvnet', 'matlab', 'vl_setupnn.m')) ; addpath Layers Datasets opts.dataDir = fullfile('data','UCF101') ; opts.expDir = fullfile('exp', 'UCF101') ; opts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat'); opts.datasetFn = @cnn_ucf101_setup_data ; opts.networkFn = @cnn_init_resnext ; opts.pool1Type = 'none' ; opts.pool1Layer = 'conv1' ; opts.pool2Layer = '' ; [opts, varargin] = vl_argparse(opts, varargin) ; opts.numFetchThreads = 8 ; opts.lite = false ; opts.imdbPath = fullfile(opts.dataDir, 'imdb-rgb.mat'); opts.ARPoolLayer = 'conv0'; % before conv1 opts.DropOutRate = 0.5 ; opts.epochFactor = 5 ; opts.split = 1; % data split opts.reverseDyn = 0; % reverse video frames e.g.[N:-1:1] opts.train = struct() ; opts.train.gpus = []; opts.train.batchSize = 128 ; opts.train.numSubBatches = 16 ; opts.train.solver = [] ; opts.train.prefetch = true ; opts.train.numEpochs = 30 ; % resnet50 opts.train.learningRate = 1e-2 * [ones(1,2), 0.1*ones(1,1), 0.01*ones(1,1)]; % caffe-ref opts.train.learningRate = 1e-4 * [ones(1,2), 0.1*ones(1,1), 0.01*ones(1,1)]; opts = vl_argparse(opts, varargin) ; if ~isfield(opts.train, 'gpus'), opts.train.gpus = []; end; % opts.train.numEpochs = numel(opts.train.learningRate); % ------------------------------------------------------------------------- % Prepare data % ------------------------------------------------------------------------- if exist(opts.imdbPath,'file') imdb = load(opts.imdbPath) ; else imdb = opts.datasetFn('dataDir', opts.dataDir, 'lite', opts.lite) ; mkdir(opts.expDir) ; save(opts.imdbPath, '-struct', 'imdb') ; end % UCF101 has 3 data splits 
if opts.split>3 error('split should be <=3'); end imdb.images.set = imdb.images.sets(opts.split,:); % reverse frame order if opts.reverseDyn for i=1:numel(imdb.images.names) imdb.images.names{i} = imdb.images.names{i}(end:-1:1); end end % ------------------------------------------------------------------------- % Prepare model % ------------------------------------------------------------------------- net = load(opts.modelPath); if isfield(net,'net') net = net.net; end opts.nCls = max(imdb.images.label) ; net = opts.networkFn(net,opts); if numel(net.meta.normalization.averageImage)>3 sz = size(net.meta.normalization.averageImage) ; net.meta.normalization.averageImage = ... mean(reshape(net.meta.normalization.averageImage,[sz(1)*sz(2) sz(3)]),1) ; end % Set the class names in the network net.meta.classes.name = imdb.classes.name ; net.meta.classes.description = imdb.classes.name ; % ------------------------------------------------------------------------- % Learn % ------------------------------------------------------------------------- if opts.epochFactor>0 opts.train.train = repmat(find(imdb.images.set==1),[1 opts.epochFactor]) ; else opts.train.train = NaN ; end opts.train.val = find(imdb.images.set==3) ; [net, info] = cnn_train_dicnn_dag(net, imdb, getBatchFn(opts, net.meta), ... 'expDir', opts.expDir, ... 
opts.train) ; % ------------------------------------------------------------------------- % Report accuracy % ------------------------------------------------------------------------- errlayer = net.getLayerIndex('errMC') ; if ~isnan(errlayer) cats = imdb.classes.name ; accs = net.layers(errlayer).block.accuracy ; if numel(cats)~=numel(accs) error('wrong number of classes\n') ; end for i=1:numel(cats) fprintf('%s acc %.1f\n',cats{i},100*accs(i)) ; end fprintf('Mean accuracy %.1f\n',100*mean(accs)) ; end % ------------------------------------------------------------------------- function fn = getBatchFn(opts, meta) % ------------------------------------------------------------------------- useGpu = numel(opts.train.gpus) > 0 ; bopts.numThreads = opts.numFetchThreads ; bopts.imageSize = meta.normalization.imageSize ; if isfield(meta.normalization,'border') bopts.border = meta.normalization.border ; else bopts.border = meta.normalization.imageSize(1:2) ./ ... meta.normalization.cropSize - meta.normalization.imageSize(1:2); end % bopts.averageImage = []; bopts.averageImage = meta.normalization.averageImage ; bopts.interpolation = meta.normalization.interpolation ; bopts.keepAspect = meta.normalization.keepAspect ; % bopts.rgbVariance = meta.augmentation.rgbVariance ; % bopts.transformation = meta.augmentation.transformation ; fn = @(x,y) getDagNNBatch(bopts,useGpu,x,y) ; % ------------------------------------------------------------------------- function inputs = getDagNNBatch(opts, useGpu, imdb, batch) % ------------------------------------------------------------------------- % batch refers to videos (not for frames) if isempty(batch) inputs = {'input', [], 'label', [], 'VideoId1', [], 'VideoId2', []}; return; end isVal = ~isempty(batch) && imdb.images.set(batch(1)) ~= 1 ; % if ~isVal, transformation='stretch'; else transformation='none';end if ~isVal, transformation='multiScaleRegular'; else transformation='none';end names = imdb.images.names(batch); % images = 
strcat([imdb.imageDir filesep], imdb.images.name(batch)) ; namesM = {}; nVids = numel(batch); VideoId1 = []; VideoId2 = []; % step-size stepSize = 6; % pool nFrames into a dynamic image nFrames = 1; % number of dynamic images to be max pooled later nDynImgs = 10; c1 = 1; for v=1:nVids name = names{v}; nFrms = numel(name); nSample = nFrames; nr = numel(1:stepSize:nFrms); % jitter by removing 50 % and limit a batch to nMaxs * nSamples images if nr > 1 && (~isVal && nr>nDynImgs) rat = min(nDynImgs,ceil(0.50*nr)); ri = randperm(nr); ri = ri(1:rat); r = zeros(1,nr); r(ri) = 1; else r = ones(1,nr); end c3 = 1; c2 = 0; for f=1:stepSize:nFrms if r(c3) idx = f:min(f+nSample-1,nFrms) ; if numel(idx) 0 if useGpu im = gpuArray(im) ; end inputs = {'input', im, 'label', imdb.images.label(batch), ... 'VideoId2', VideoId2}; end ================================================ FILE: dicnn/cnn_train_dicnn_dag.m ================================================ function [net,stats] = cnn_train_dicnn_dag(net, imdb, getBatch, varargin) %CNN_DICNN_TRAIN_DAG Demonstrates training a CNN using the DagNN wrapper % CNN_TRAIN_DAG() is similar to CNN_TRAIN(), but works with % the DagNN wrapper instead of the SimpleNN wrapper. % Copyright (C) 2014-16 Andrea Vedaldi. % All rights reserved. % % This file is part of the VLFeat library and is made available under % the terms of the BSD license (see the COPYING file). addpath(fullfile(vl_rootnn, 'examples')); opts.expDir = fullfile('data','exp') ; opts.continue = true ; opts.batchSize = 256 ; opts.numSubBatches = 1 ; opts.train = [] ; opts.val = [] ; opts.gpus = [] ; opts.prefetch = false ; opts.epochSize = inf; opts.numEpochs = 300 ; opts.learningRate = 0.001 ; opts.weightDecay = 0.0005 ; opts.solver = [] ; % Empty array means use the default SGD solver [opts, varargin] = vl_argparse(opts, varargin) ; if ~isempty(opts.solver) assert(isa(opts.solver, 'function_handle') && nargout(opts.solver) == 2,... 
'Invalid solver; expected a function handle with two outputs.') ; % Call without input arguments, to get default options opts.solverOpts = opts.solver() ; end opts.momentum = 0.9 ; opts.saveSolverState = true ; opts.nesterovUpdate = false ; opts.randomSeed = 0 ; opts.profile = false ; opts.parameterServer.method = 'mmap' ; opts.parameterServer.prefix = 'mcn' ; opts.derOutputs = {'objective', 1} ; opts.extractStatsFn = @extractStats ; opts.plotStatistics = true; opts.postEpochFn = [] ; % postEpochFn(net,params,state) called after each epoch; can return a new learning rate, 0 to stop, [] for no change opts = vl_argparse(opts, varargin) ; if ~exist(opts.expDir, 'dir'), mkdir(opts.expDir) ; end if isempty(opts.train), opts.train = find(imdb.images.set==1) ; end if isempty(opts.val), opts.val = find(imdb.images.set==2) ; end if isscalar(opts.train) && isnumeric(opts.train) && isnan(opts.train) opts.train = [] ; end if isscalar(opts.val) && isnumeric(opts.val) && isnan(opts.val) opts.val = [] ; end % ------------------------------------------------------------------------- % Initialization % ------------------------------------------------------------------------- evaluateMode = isempty(opts.train) ; if ~evaluateMode if isempty(opts.derOutputs) error('DEROUTPUTS must be specified when training.\n') ; end end % ------------------------------------------------------------------------- % Train and validate % ------------------------------------------------------------------------- modelPath = @(ep) fullfile(opts.expDir, sprintf('net-epoch-%d.mat', ep)); modelFigPath = fullfile(opts.expDir, 'net-train.pdf') ; start = opts.continue * findLastCheckpoint(opts.expDir) ; if start >= 1 fprintf('%s: resuming by loading epoch %d\n', mfilename, start) ; [net, state, stats] = loadState(modelPath(start)) ; else state = [] ; end for epoch=start+1:opts.numEpochs % Set the random seed based on the epoch and opts.randomSeed. 
% This is important for reproducibility, including when training % is restarted from a checkpoint. rng(epoch + opts.randomSeed) ; prepareGPUs(opts, epoch == start+1) ; % Train for one epoch. params = opts ; params.epoch = epoch ; params.learningRate = opts.learningRate(min(epoch, numel(opts.learningRate))) ; params.train = opts.train(randperm(numel(opts.train))) ; % shuffle params.train = params.train(1:min(opts.epochSize, numel(opts.train))); params.val = opts.val(randperm(numel(opts.val))) ; params.imdb = imdb ; params.getBatch = getBatch ; if numel(opts.gpus) <= 1 [net, state] = processEpoch(net, state, params, 'train') ; [net, state] = processEpoch(net, state, params, 'val') ; if ~evaluateMode saveState(modelPath(epoch), net, state) ; end lastStats = state.stats ; else spmd [net, state] = processEpoch(net, state, params, 'train') ; [net, state] = processEpoch(net, state, params, 'val') ; if labindex == 1 && ~evaluateMode saveState(modelPath(epoch), net, state) ; end lastStats = state.stats ; end lastStats = accumulateStats(lastStats) ; end stats.train(epoch) = lastStats.train ; stats.val(epoch) = lastStats.val ; clear lastStats ; saveStats(modelPath(epoch), stats) ; if opts.plotStatistics switchFigure(1) ; clf ; plots = setdiff(... cat(2,... fieldnames(stats.train)', ... 
fieldnames(stats.val)'), {'num', 'time'}) ; for p = plots p = char(p) ; values = zeros(0, epoch) ; leg = {} ; for f = {'train', 'val'} f = char(f) ; if isfield(stats.(f), p) tmp = [stats.(f).(p)] ; values(end+1,:) = tmp(1,:)' ; leg{end+1} = f ; end end subplot(1,numel(plots),find(strcmp(p,plots))) ; plot(1:epoch, values','o-') ; xlabel('epoch') ; title(p) ; legend(leg{:}) ; grid on ; end drawnow ; print(1, modelFigPath, '-dpdf') ; end if ~isempty(opts.postEpochFn) if nargout(opts.postEpochFn) == 0 opts.postEpochFn(net, params, state) ; else lr = opts.postEpochFn(net, params, state) ; if ~isempty(lr), opts.learningRate = lr; end if opts.learningRate == 0, break; end end end end % With multiple GPUs, return one copy if isa(net, 'Composite'), net = net{1} ; end % ------------------------------------------------------------------------- function [net, state] = processEpoch(net, state, params, mode) % ------------------------------------------------------------------------- % Note that net is not strictly needed as an output argument as net % is a handle class. However, this fixes some aliasing issue in the % spmd caller. 
% initialize with momentum 0 if isempty(state) || isempty(state.solverState) state.solverState = cell(1, numel(net.params)) ; state.solverState(:) = {0} ; end % move CNN to GPU as needed numGpus = numel(params.gpus) ; if numGpus >= 1 net.move('gpu') ; for i = 1:numel(state.solverState) s = state.solverState{i} ; if isnumeric(s) state.solverState{i} = gpuArray(s) ; elseif isstruct(s) state.solverState{i} = structfun(@gpuArray, s, 'UniformOutput', false) ; end end end if numGpus > 1 parserv = ParameterServer(params.parameterServer) ; net.setParameterServer(parserv) ; else parserv = [] ; end % profile if params.profile if numGpus <= 1 profile clear ; profile on ; else mpiprofile reset ; mpiprofile on ; end end num = 0 ; epoch = params.epoch ; subset = params.(mode) ; adjustTime = 0 ; stats.num = 0 ; % return something even if subset = [] stats.time = 0 ; start = tic ; for t=1:params.batchSize:numel(subset) fprintf('%s: epoch %02d: %3d/%3d:', mode, epoch, ... fix((t-1)/params.batchSize)+1, ceil(numel(subset)/params.batchSize)) ; batchSize = min(params.batchSize, numel(subset) - t + 1) ; for s=1:params.numSubBatches % get this image batch and prefetch the next batchStart = t + (labindex-1) + (s-1) * numlabs ; batchEnd = min(t+params.batchSize-1, numel(subset)) ; batch = subset(batchStart : params.numSubBatches * numlabs : batchEnd) ; num = num + numel(batch) ; if numel(batch) == 0, continue ; end inputs = params.getBatch(params.imdb, batch) ; if params.prefetch if s == params.numSubBatches batchStart = t + (labindex-1) + params.batchSize ; batchEnd = min(t+2*params.batchSize-1, numel(subset)) ; else batchStart = batchStart + numlabs ; end nextBatch = subset(batchStart : params.numSubBatches * numlabs : batchEnd) ; params.getBatch(params.imdb, nextBatch) ; end if strcmp(mode, 'train') net.mode = 'normal' ; net.accumulateParamDers = (s ~= 1) ; net.eval(inputs, params.derOutputs, 'holdOn', s < params.numSubBatches) ; else net.mode = 'test' ; net.eval(inputs) ; end end % 
Accumulate gradient. if strcmp(mode, 'train') if ~isempty(parserv), parserv.sync() ; end state = accumulateGradients(net, state, params, parserv) ; end % Get statistics. time = toc(start) + adjustTime ; batchTime = time - stats.time ; stats.num = num ; stats.time = time ; stats = params.extractStatsFn(stats,net) ; currentSpeed = batchSize / batchTime ; averageSpeed = (t + batchSize - 1) / time ; if t == 3*params.batchSize + 1 % compensate for the first three iterations, which are outliers adjustTime = 4*batchTime - time ; stats.time = time + adjustTime ; end fprintf(' %.1f (%.1f) Hz', averageSpeed, currentSpeed) ; for f = setdiff(fieldnames(stats)', {'num', 'time'}) f = char(f) ; fprintf(' %s: %.3f', f, stats.(f)) ; end fprintf('\n') ; end % Save back to state. state.stats.(mode) = stats ; if params.profile if numGpus <= 1 state.prof.(mode) = profile('info') ; profile off ; else state.prof.(mode) = mpiprofile('info'); mpiprofile off ; end end if ~params.saveSolverState state.solverState = [] ; else for i = 1:numel(state.solverState) s = state.solverState{i} ; if isnumeric(s) state.solverState{i} = gather(s) ; elseif isstruct(s) state.solverState{i} = structfun(@gather, s, 'UniformOutput', false) ; end end end net.reset() ; net.move('cpu') ; % ------------------------------------------------------------------------- function state = accumulateGradients(net, state, params, parserv) % ------------------------------------------------------------------------- numGpus = numel(params.gpus) ; otherGpus = setdiff(1:numGpus, labindex) ; den = params.numSubBatches * max(numGpus,1) ; for p=1:numel(net.params) if ~isempty(parserv) parDer = parserv.pullWithIndex(p) ; else parDer = net.params(p).der ; end switch net.params(p).trainMethod case 'average' % mainly for batch normalization thisLR = net.params(p).learningRate ; net.params(p).value = vl_taccum(... 1 - thisLR, net.params(p).value, ... 
(thisLR/den/net.params(p).fanout), parDer) ; case 'gradient' thisDecay = params.weightDecay * net.params(p).weightDecay ; thisLR = params.learningRate * net.params(p).learningRate ; if thisLR>0 || thisDecay>0 % Normalize gradient and incorporate weight decay. parDer = vl_taccum(1/den, parDer, ... thisDecay, net.params(p).value) ; if isempty(params.solver) % Default solver is the optimised SGD. % Update momentum. state.solverState{p} = vl_taccum(... params.momentum, state.solverState{p}, ... -1, parDer) ; % Nesterov update (aka one step ahead). if params.nesterovUpdate delta = params.momentum * state.solverState{p} - parDer ; else delta = state.solverState{p} ; end % Update parameters. net.params(p).value = vl_taccum(... 1, net.params(p).value, thisLR, delta) ; else % call solver function to update weights [net.params(p).value, state.solverState{p}] = ... params.solver(net.params(p).value, state.solverState{p}, ... parDer, params.solverOpts, thisLR) ; end end otherwise error('Unknown training method ''%s'' for parameter ''%s''.', ... net.params(p).trainMethod, ... 
net.params(p).name) ; end end % ------------------------------------------------------------------------- function stats = accumulateStats(stats_) % ------------------------------------------------------------------------- for s = {'train', 'val'} s = char(s) ; total = 0 ; % initialize stats stucture with same fields and same order as % stats_{1} stats__ = stats_{1} ; names = fieldnames(stats__.(s))' ; values = zeros(1, numel(names)) ; fields = cat(1, names, num2cell(values)) ; stats.(s) = struct(fields{:}) ; for g = 1:numel(stats_) stats__ = stats_{g} ; num__ = stats__.(s).num ; total = total + num__ ; for f = setdiff(fieldnames(stats__.(s))', 'num') f = char(f) ; stats.(s).(f) = stats.(s).(f) + stats__.(s).(f) * num__ ; if g == numel(stats_) stats.(s).(f) = stats.(s).(f) / total ; end end end stats.(s).num = total ; end % ------------------------------------------------------------------------- function stats = extractStats(stats, net) % ------------------------------------------------------------------------- sel = find(cellfun(@(x) isa(x,'dagnn.Loss'), {net.layers.block})) ; for i = 1:numel(sel) if net.layers(sel(i)).block.ignoreAverage, continue; end; stats.(net.layers(sel(i)).outputs{1}) = net.layers(sel(i)).block.average ; end % ------------------------------------------------------------------------- function saveState(fileName, net_, state) % ------------------------------------------------------------------------- net = net_.saveobj() ; save(fileName, 'net', 'state') ; % ------------------------------------------------------------------------- function saveStats(fileName, stats) % ------------------------------------------------------------------------- if exist(fileName) save(fileName, 'stats', '-append') ; else save(fileName, 'stats') ; end % ------------------------------------------------------------------------- function [net, state, stats] = loadState(fileName) % ------------------------------------------------------------------------- 
load(fileName, 'net', 'state', 'stats') ;
net = dagnn.DagNN.loadobj(net) ;
% A complete checkpoint contains 'net', 'state' (written by saveState) and
% 'stats' (appended later by saveStats); a missing 'stats' variable means
% the epoch file was only partially written.
if isempty(whos('stats'))
  error('Epoch ''%s'' was only partially saved. Delete this file and try again.', ...
    fileName) ;
end

% -------------------------------------------------------------------------
function epoch = findLastCheckpoint(modelDir)
% -------------------------------------------------------------------------
% Return the largest epoch index N for which net-epoch-N.mat exists in
% MODELDIR; 0 when no checkpoint is found.
list = dir(fullfile(modelDir, 'net-epoch-*.mat')) ;
tokens = regexp({list.name}, 'net-epoch-([\d]+).mat', 'tokens') ;
epoch = cellfun(@(x) sscanf(x{1}{1}, '%d'), tokens) ;
epoch = max([epoch 0]) ;

% -------------------------------------------------------------------------
function switchFigure(n)
% -------------------------------------------------------------------------
% Make figure N current without stealing focus; fall back to figure()
% when the figure window does not exist yet.
if get(0,'CurrentFigure') ~= n
  try
    set(0,'CurrentFigure',n) ;
  catch
    figure(n) ;
  end
end

% -------------------------------------------------------------------------
function clearMex()
% -------------------------------------------------------------------------
% Unload MEX files before a GPU reset (they may hold device memory).
clear vl_tmove vl_imreadjpeg ;

% -------------------------------------------------------------------------
function prepareGPUs(opts, cold)
% -------------------------------------------------------------------------
% Prepare the GPU(s) listed in OPTS.GPUS; COLD forces a device reset.
numGpus = numel(opts.gpus) ;
if numGpus > 1
  % check parallel pool integrity as it could have timed out
  pool = gcp('nocreate') ;
  if ~isempty(pool) && pool.NumWorkers ~= numGpus
    delete(pool) ;
  end
  pool = gcp('nocreate') ;
  if isempty(pool)
    parpool('local', numGpus) ;
    cold = true ;
  end
end
if numGpus >= 1 && cold
  fprintf('%s: resetting GPU\n', mfilename)
  clearMex() ;
  if numGpus == 1
    gpuDevice(opts.gpus)
  else
    spmd
      clearMex() ;
      gpuDevice(opts.gpus(labindex))
    end
  end
end

================================================ FILE: dicnn/cnn_video_of_get_batch.m ================================================

function imo = cnn_video_of_get_batch(images, vids, varargin)
% CNN_VIDEO_OF_GET_BATCH Load, preprocess, and pack images for CNN evaluation
%
% video ids
% use same spatial jittering for frames from the same video
% NOTE: all the frames from a video should have the same size (wxh)
% Default options. Flow frames arrive as interleaved u/v jpeg pairs, so
% IMAGES holds 2*numFrames entries and each output slot has 2 channels.
opts.imageSize = [227, 227] ;
opts.border = [29, 29] ;
opts.keepAspect = true ;
opts.numAugments = 1 ;
opts.transformation = 'multiScaleRegular' ;
opts.averageImage = [] ;
opts.rgbVariance = zeros(0,2,'single') ;
opts.interpolation = 'bilinear' ;
opts.numThreads = 1 ;
opts.prefetch = false ;
opts.lazyResize = true ;
opts.subMean = false; % subtract the mean from each video
opts = vl_argparse(opts, varargin);

% fetch is true if images is a list of filenames (instead of
% a cell array of images)
fetch = numel(images) >= 1 && ischar(images{1}) ;
% prefetch is used to load images in a separate thread
prefetch = fetch & opts.prefetch ;
if prefetch
  vl_imreadjpeg(images, 'numThreads', opts.numThreads, 'prefetch') ;
  imo = [] ;
  return ;
end
if fetch
  im = vl_imreadjpeg(images,'numThreads', opts.numThreads) ;
else
  im = images ;
end

% Tabulated crop/flip transformations; rows are (ty, tx, flip).
tfs = [] ;
switch opts.transformation
  case 'none'
    tfs = [ .5 ; .5 ; 0 ] ;
  case 'f5'
    tfs = [...
      .5 0 0 1 1 .5 0 0 1 1 ;
      .5 0 1 0 1 .5 0 1 0 1 ;
      0 0 0 0 0 1 1 1 1 1] ;
  case 'f25'
    [tx,ty] = meshgrid(linspace(0,1,5)) ;
    tfs = [tx(:)' ; ty(:)' ; zeros(1,numel(tx))] ;
    tfs_ = tfs ;
    tfs_(3,:) = 1 ;
    tfs = [tfs,tfs_] ;
  case 'stretch'
  case 'multiScaleRegular'
  otherwise
    error('Uknown transformations %s', opts.transformation) ;
end
[~,transformations] = sort(rand(size(tfs,2), numel(images)), 1) ;

if ~isempty(opts.rgbVariance) && isempty(opts.averageImage)
  opts.averageImage = zeros(1,1,2) ;
end
if numel(opts.averageImage) == 2
  opts.averageImage = reshape(opts.averageImage, 1,1,2) ;
end

% Output tensor: 2 flow channels (u,v); one slot per u/v frame pair.
imo = zeros(opts.imageSize(1), opts.imageSize(2), 2, ...
  numel(images)/2*opts.numAugments, 'single') ;

nVid = max(vids);
si = 1 ;
countv = 1;
for v=1:nVid
  vid = find(vids==v);
  for i=1:numel(images(vid))
    % acquire image
    if isempty(im{i})
      imt1 = imread(images{2*vid(i)-1}) ;
      imt2 = imread(images{2*vid(i)}) ;
    else
      imt1 = im{2*vid(i)-1} ;
      imt2 = im{2*vid(i)} ;
    end
    imt = single(cat(3,imt1,imt2)) ; % faster than im2single (and multiplies by 255)

    % resize
    w = size(imt,2) ;
    h = size(imt,1) ;
    factor = [(opts.imageSize(1)+opts.border(1))/h ...
      (opts.imageSize(2)+opts.border(2))/w];
    if opts.keepAspect
      factor = max(factor) ;
    end
    if any(abs(factor - 1) > 0.0001)
      imt = imresize(imt, ...
        'scale', factor, ...
        'method', opts.interpolation) ;
    end

    % crop & flip
    % Jitter parameters are drawn only once per video (i==1) so every
    % frame of the same video receives the same crop and flip.
    if i==1
      flip = rand > 0.5 ;
      w = size(imt,2) ;
      h = size(imt,1) ;
      switch opts.transformation
        case 'stretch'
          sz = round(min(opts.imageSize(1:2)' .* (1-0.1+0.2*rand(2,1)), [w;h])) ;
          dx = randi(w - sz(2) + 1, 1) ;
          dy = randi(h - sz(1) + 1, 1) ;
          % flip = rand > 0.5 ;
        case 'multiScaleRegular'
          % Regular multi-scale crops: 4 corners + centre at 4 sizes.
          reg_szs = [256, 224, 192, 168] ;
          sz(1) = reg_szs(randi(4));
          sz(2) = reg_szs(randi(4));
          dy = [0 h-sz(1) 0 h-sz(1) floor((h-sz(1)+1)/2)] + 1;
          dx = [0 w-sz(2) w-sz(2) 0 floor((w-sz(2)+1)/2)] + 1;
          corner = randi(5);
          dx = dx(corner);
          dy = dy(corner);
        otherwise
          tf = tfs(:, transformations(mod(0, numel(transformations)) + 1)) ;
          sz = opts.imageSize(1:2) ;
          dx = floor((w - sz(2)) * tf(2)) + 1 ;
          dy = floor((h - sz(1)) * tf(1)) + 1 ;
          % flip = tf(3) ;
      end
    end
    if opts.lazyResize
      % Crop-by-sampling: index grids instead of an explicit resize.
      sx = round(linspace(dx, sz(2)+dx-1, opts.imageSize(2))) ;
      sy = round(linspace(dy, sz(1)+dy-1, opts.imageSize(1))) ;
    else
      factor = [opts.imageSize(1)/sz(1) ...
        opts.imageSize(2)/sz(2)];
      if any(abs(factor - 1) > 0.0001)
        imt = imresize(gather(imt(dy:sz(1)+dy-1,dx:sz(2)+dx-1,:)), [opts.imageSize(1:2)],...
'Antialiasing', false, 'Method', opts.interpolation);
      end
      sx = 1:opts.imageSize(2);
      sy = 1:opts.imageSize(1);
    end
    if flip
      % Horizontal flip: mirror the columns and negate the u (horizontal)
      % flow component around the 255-complement encoding used by the jpegs.
      sx = fliplr(sx) ;
      imo(:,:,1,si) = 255 - imt(sy,sx,1) ;
      imo(:,:,2,si) = imt(sy,sx,2) ;
    else
      imo(:,:,:,si) = imt(sy,sx,:) ;
    end
    si = si + 1 ;
  end
  countv = countv + numel(images(vid));
end

% Subtract the mean flow image, optionally jittered with rgbVariance.
if ~isempty(opts.averageImage) && numel(opts.averageImage)==2
  if ~isempty(opts.rgbVariance)
    % BUG FIX: flow data has 2 channels, so the variance jitter must be
    % reshaped to 1x1x2; the previous reshape(...,1,1,3) errors on the
    % 2-element vector produced by opts.rgbVariance * randn(2,1).
    imo = bsxfun(@minus, imo, opts.averageImage+reshape(opts.rgbVariance * randn(2,1), 1,1,2)) ;
  else
    imo = bsxfun(@minus, imo, opts.averageImage) ;
  end
end

================================================ FILE: dicnn/cnn_video_rgb_get_batch.m ================================================

function imo = cnn_video_rgb_get_batch(images, vids, varargin)
% CNN_VIDEO_RGB_GET_BATCH Load, preprocess, and pack images for CNN evaluation
% video ids
% use same spatial jittering for frames from the same video
% NOTE: all the frames from a video should have the same size (wxh)
opts.imageSize = [227, 227] ;
opts.border = [29, 29] ;
opts.keepAspect = true ;
opts.numAugments = 1 ;
opts.transformation = 'none' ;
opts.averageImage = [] ;
opts.rgbVariance = zeros(0,3,'single') ;
opts.interpolation = 'bilinear' ;
opts.numThreads = 1 ;
opts.prefetch = false ;
opts.subMean = false ; % subtract the mean from each video
opts.lazyResize = true ;
opts = vl_argparse(opts, varargin);

% fetch is true if images is a list of filenames (instead of
% a cell array of images)
fetch = numel(images) >= 1 && ischar(images{1}) ;
% prefetch is used to load images in a separate thread
prefetch = fetch & opts.prefetch ;
if prefetch
  vl_imreadjpeg(images, 'numThreads', opts.numThreads, 'prefetch') ;
  imo = [] ;
  return ;
end
if fetch
  im = vl_imreadjpeg(images,'numThreads', opts.numThreads) ;
else
  im = images ;
end

tfs = [] ;
switch opts.transformation
  case 'none'
    tfs = [ .5 ; .5 ; 0 ] ;
  case 'f5'
    tfs = [...
      .5 0 0 1 1 .5 0 0 1 1 ;
      .5 0 1 0 1 .5 0 1 0 1 ;
      0 0 0 0 0 1 1 1 1 1] ;
  case 'f25'
    [tx,ty] = meshgrid(linspace(0,1,5)) ;
    tfs = [tx(:)' ; ty(:)' ; zeros(1,numel(tx))] ;
    tfs_ = tfs ;
    tfs_(3,:) = 1 ;
    tfs = [tfs,tfs_] ;
  case 'stretch'
  case 'multiScaleRegular'
  otherwise
    error('Uknown transformations %s', opts.transformation) ;
end
[~,transformations] = sort(rand(size(tfs,2), numel(images)), 1) ;

if ~isempty(opts.rgbVariance) && isempty(opts.averageImage)
  opts.averageImage = zeros(1,1,3) ;
end
if numel(opts.averageImage) == 3
  opts.averageImage = reshape(opts.averageImage, 1,1,3) ;
end

% Output tensor: one RGB slot per input frame.
imo = zeros(opts.imageSize(1), opts.imageSize(2), 3, ...
  numel(images)*opts.numAugments, 'single') ;

nVid = max(vids);
si = 1 ;
countv = 1;
for v=1:nVid
  vid = find(vids==v);
  for i=1:numel(images(vid))
    % acquire image
    if isempty(im{i})
      imt = imread(images{vid(i)}) ;
      imt = single(imt) ; % faster than im2single (and multiplies by 255)
    else
      imt = im{vid(i)} ;
    end
    % Replicate grayscale frames to three channels.
    if size(imt,3) == 1
      imt = cat(3, imt, imt, imt) ;
    end

    % resize
    w = size(imt,2) ;
    h = size(imt,1) ;
    factor = [(opts.imageSize(1)+opts.border(1))/h ...
      (opts.imageSize(2)+opts.border(2))/w];
    if opts.keepAspect
      factor = max(factor) ;
    end
    if any(abs(factor - 1) > 0.0001)
      imt = imresize(imt, ...
        'scale', factor, ...
        'method', opts.interpolation) ;
    end

    % crop & flip
    % Jitter parameters are sampled only once per video (i==1) so all
    % frames of the same video share the same crop and flip.
    if i==1
      w = size(imt,2) ;
      h = size(imt,1) ;
      switch opts.transformation
        case 'stretch'
          sz = round(min(opts.imageSize(1:2)' .* (1-0.1+0.2*rand(2,1)), [w;h])) ;
          dx = randi(w - sz(2) + 1, 1) ;
          dy = randi(h - sz(1) + 1, 1) ;
          flip = rand > 0.5 ;
        case 'multiScaleRegular'
          % Regular multi-scale crops: 4 corners + centre at 4 sizes.
          reg_szs = [256, 224, 192, 168] ;
          sz(1) = reg_szs(randi(4));
          sz(2) = reg_szs(randi(4));
          dy = [0 h-sz(1) 0 h-sz(1) floor((h-sz(1)+1)/2)] + 1;
          dx = [0 w-sz(2) w-sz(2) 0 floor((w-sz(2)+1)/2)] + 1;
          corner = randi(5);
          dx = dx(corner);
          dy = dy(corner);
          flip = rand > 0.5 ;
        otherwise
          tf = tfs(:, transformations(mod(0, numel(transformations)) + 1)) ;
          sz = opts.imageSize(1:2) ;
          dx = floor((w - sz(2)) * tf(2)) + 1 ;
          dy = floor((h - sz(1)) * tf(1)) + 1 ;
          flip = tf(3) ;
      end
    end
    if opts.lazyResize
      % Crop-by-sampling: index grids instead of an explicit resize.
      sx = round(linspace(dx, sz(2)+dx-1, opts.imageSize(2))) ;
      sy = round(linspace(dy, sz(1)+dy-1, opts.imageSize(1))) ;
    else
      factor = [opts.imageSize(1)/sz(1) ...
        opts.imageSize(2)/sz(2)];
      if any(abs(factor - 1) > 0.0001)
        imt = imresize(gather(imt(dy:sz(1)+dy-1,dx:sz(2)+dx-1,:)), ...
          opts.imageSize(1:2), 'Antialiasing', false, ...
          'Method', opts.interpolation);
      end
      sx = 1:opts.imageSize(2);
      sy = 1:opts.imageSize(1);
    end
    if flip
      sx = fliplr(sx) ;
    end
    imo(:,:,:,si) = imt(sy,sx,:) ;
    si = si + 1 ;
  end
  countv = countv + numel(images(vid));
end

% Subtract the mean image, optionally jittered with the PCA-style RGB
% variance (presumably opts.rgbVariance is 3x3 when supplied -- TODO confirm).
if ~isempty(opts.averageImage) && numel(opts.averageImage)==3
  if ~isempty(opts.rgbVariance)
    imo = bsxfun(@minus, imo, opts.averageImage+reshape(opts.rgbVariance * randn(3,1), 1,1,3)) ;
  else
    imo = bsxfun(@minus, imo, opts.averageImage) ;
  end
end

================================================ FILE: dicnn/compute_approximate_dynamic_images.m ================================================

function di = compute_approximate_dynamic_images(images)
% Computes approximate dynamic images for a given array of images
% IMAGES must be a tensor of H x W x D x N dimensionality or
% cell of image names
% For the exact dynamic images, use the code
% http://users.cecs.anu.edu.au/~basura/dynamic_images/code.zip
% Explained here http://arxiv.org/abs/1512.01848
if isempty(images)
  di = [] ;
  return ;
end
% A cell array of file names is loaded and stacked along the 4th dimension.
if iscell(images)
  imagesA = cell(1,numel(images)) ;
  for i=1:numel(images)
    if ~ischar(images{i})
      error('images must be an array of images or cell of image names') ;
    end
    imagesA{i} = imread(images{i}) ;
  end
  images = cat(4,imagesA{:}) ;
end
N = size(images,4) ;
% All N frames belong to a single video, i.e. one pooling segment.
di = vl_nnarpooltemporal(single(images),ones(1,N)) ;

================================================ FILE: dicnn/visualize_approximate_dynamic_images.m ================================================

function visualize_approximate_dynamic_images(images)
% VISUALIZE_DYNAMIC_IMAGES
% Compute the approximate dynamic image, rescale it to [0,255] and display.
di = compute_approximate_dynamic_images(images) ;
di = di - min(di(:)) ;
di = 255 * di ./ max(di(:)) ;
image(uint8(di)) ;

================================================ FILE: main_train.m ================================================

model = 'resnext50' ; % {'cafferef','resnext50','resnext101'}
input = 'rgb' ; % {'rgb','of'}
dataset = 'ucf101' ; % {'ucf101','hmdb51'} hmdb51 requires more iterations to train (add more epochs to learning
% rate)
opts.train.batchSize = 128 ;
opts.train.numSubBatches = 32 ; % increase the number (16,32) if it does not fit into gpu mem
opts.epochFactor = 5 ;
opts.split = 1 ;
opts.train.gpus = 1 ;

run matconvnet/matlab/vl_setupnn.m ;
vl_contrib install mcnExtraLayers ;
vl_contrib setup mcnExtraLayers ;
vl_contrib install autonn ;
vl_contrib setup autonn ;
% addpath(fullfile('matconvnet','contrib','mcnExtraLayers','matlab')) ;

% NOTE(review): expDir hard-codes 'rgb' even when input=='of' -- confirm
% whether optical-flow experiments should write to a separate directory.
opts.expDir = ['exp/' model 'rgb-arpool-split' num2str(opts.split)] ;

if strcmp(input,'rgb')
  opts.DropOutRate = 0.5 ;
  trainfn = @cnn_dicnn_rgb ;
elseif strcmp(input,'of')
  opts.DropOutRate = 0.8 ;
  trainfn = @cnn_dicnn_of ;
end

if strcmp(model,'cafferef')
  opts.pool1Layer = 'conv1' ;
  % download from http://www.vlfeat.org/matconvnet/models/imagenet-caffe-ref.mat
  opts.modelPath = fullfile('models','imagenet-caffe-ref.mat') ;
  opts.networkFn = @cnn_init_cafferef ;
  if strcmp(input,'rgb')
    opts.train.learningRate = 1e-3 * [ones(1,2) 0.1*ones(1,2)] ;
  else
    opts.train.learningRate = 3e-3 * [ones(1,10) 0.1*ones(1,2)] ;
  end
  opts.train.numEpochs = numel(opts.train.learningRate) ;
elseif strcmp(model,'resnext50') || strcmp(model,'resnext101')
  % download from http://www.robots.ox.ac.uk/~albanie/models/pytorch-imports/resnext_50_32x4d-pt-mcn.mat
  % download from http://www.robots.ox.ac.uk/~albanie/models/pytorch-imports/resnext_101_32x4d-pt-mcn.mat
  if strcmp(model,'resnext50')
    opts.modelPath = fullfile('models','resnext_50_32x4d-pt-mcn.mat') ;
  else
    opts.modelPath = fullfile('models','resnext_101_32x4d-pt-mcn.mat') ;
  end
  % BUG FIX: removed an unconditional overwrite that reset modelPath to the
  % resnext50 weights, which made the resnext101 branch load the wrong model.
  opts.networkFn = @cnn_init_resnext ;
  if strcmp(input,'rgb')
    opts.train.learningRate = 1e-2 * [ones(1,2) 0.1*ones(1,8) ] ;
  else
    opts.train.learningRate = 1e-2 * [ones(1,2) 0.1*ones(1,2) ] ;
  end
end

addpath dicnn ;
[net, info] = trainfn(opts)

================================================ FILE: utils/extract_frames.sh ================================================

# !/bin/bash
# This
# script converts videos into frames
# for different fps change (-r 1)
for f in *.avi
do
  # Strip the .avi suffix to name the per-video frame directory.
  g=$(basename "$f" .avi)
  echo "Processing $f"
  mkdir -p "frames/$g/"
  # BUG FIX: quote "$f" and the output path so file names containing
  # spaces or shell metacharacters do not break the loop.
  ffmpeg -i "$f" "frames/$g/image-%04d.jpeg"
done