Repository: jiyanggao/TALL Branch: master Commit: 3df6794af148 Files: 10 Total size: 45.8 KB Directory structure: gitextract_wdpa8dzs/ ├── README.md ├── ctrl_model.py ├── ctrl_test_results.txt ├── dataset.py ├── exp_data/ │ └── .gitkeep ├── main.py ├── util/ │ ├── __init__.py │ └── cnn.py ├── video_allframes_info.pkl └── vs_multilayer.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: README.md ================================================ ## TALL: Temporal Activity Localization via Language Query This is the repository for our ICCV 2017 paper [_TALL: Temporal Activity Localization via Language Query_](https://arxiv.org/abs/1705.02101). ### Visual Features on TACoS Download the C3D features for [training set](https://drive.google.com/file/d/1zQp0aYGFCm8PqqHOh4UtXfy2U3pJMBeu/view?usp=sharing) and [test set](https://drive.google.com/file/d/1zC-UrspRf42Qiu5prQw4fQrbgLQfJN-P/view?usp=sharing) of TACoS dataset. Modify the path to feature folders in main.py ### Sentence Embeddings on TACoS Download the Skip-thought sentence embeddings and sample files from [here](https://drive.google.com/file/d/1HF-hNFPvLrHwI5O7YvYKZWTeTxC5Mg1K/view?usp=sharing) of TACoS Dataset, and put them under exp_data folder. ### Reproduce the results on TACoS `python main.py` ### Charades-STA anno download The sentence temporal annotations on [Charades](http://allenai.org/plato/charades/) dataset are available here: [train](https://drive.google.com/file/d/1ZjG7wJpPSMIBYnW7BAG2u9VVEoNvFm5c/view?usp=sharing), [test](https://drive.google.com/file/d/1QG4MXFkoj6JFU0YK5olTY75xTARKSW5e/view?usp=sharing). The format is "[video name] [start time] [end time]##[sentence]". You may want to generate the skip-thought embeddings and C3D features on Charades-STA, and modify the codes slightly to reproduce the experiments. ### Updates on Charades-STA performance I did some anno cleaning for Charades-STA (compared to the version I used in ICCV paper), the updated performance is listed below. Please compare to these results when using Charades-STA. | Model | R@1,IoU=0.5 | R@1,IoU=0.7 | R@5,IoU=0.5 | R@5,IoU=0.7 | | :--------------- | ----------: | ----------: | ----------: | ----------: | | CTRL (aln) | 17.69 | 5.91 | 55.54 | 23.79 | | CTRL (reg-p) | 19.22 | 6.64 | 57.98 | 25.22 | | CTRL (reg-np) | 21.42 | 7.15 | 59.11 | 26.91 | ================================================ FILE: ctrl_model.py ================================================ import numpy as np import tensorflow as tf from tensorflow.python.framework import dtypes from util.cnn import fc_layer as fc import vs_multilayer from dataset import TestingDataSet from dataset import TrainingDataSet class CTRL_Model(object): def __init__(self, batch_size, train_csv_path, test_csv_path, test_visual_feature_dir, train_visual_feature_dir): self.batch_size = batch_size self.test_batch_size = 1 self.vs_lr = 0.005 self.lambda_regression = 0.01 self.alpha = 1.0/batch_size self.semantic_size = 1024 # the size of visual and semantic comparison size self.sentence_embedding_size = 4800 self.visual_feature_dim = 4096*3 self.train_set=TrainingDataSet(train_visual_feature_dir, train_csv_path, self.batch_size) self.test_set=TestingDataSet(test_visual_feature_dir, test_csv_path, self.test_batch_size) ''' used in training alignment model, CTRL(aln) ''' def fill_feed_dict_train(self): image_batch,sentence_batch,offset_batch = self.train_set.next_batch() input_feed = { self.visual_featmap_ph_train: image_batch, self.sentence_ph_train: sentence_batch, self.offset_ph: offset_batch } return input_feed ''' used in training alignment+regression model, CTRL(reg) ''' def fill_feed_dict_train_reg(self): image_batch, sentence_batch, offset_batch = self.train_set.next_batch_iou() input_feed = { self.visual_featmap_ph_train: image_batch, self.sentence_ph_train: sentence_batch, self.offset_ph: offset_batch } return input_feed ''' cross modal processing module ''' def cross_modal_comb(self, visual_feat, sentence_embed, batch_size): vv_feature = tf.reshape(tf.tile(visual_feat, [batch_size, 1]), [batch_size, batch_size, self.semantic_size]) ss_feature = tf.reshape(tf.tile(sentence_embed,[1, batch_size]),[batch_size, batch_size, self.semantic_size]) concat_feature = tf.reshape(tf.concat(2,[vv_feature, ss_feature]),[batch_size, batch_size, self.semantic_size+self.semantic_size]) print concat_feature.get_shape().as_list() mul_feature = tf.mul(vv_feature, ss_feature) add_feature = tf.add(vv_feature, ss_feature) comb_feature = tf.reshape(tf.concat(2, [mul_feature, add_feature, concat_feature]),[1, batch_size, batch_size, self.semantic_size*4]) return comb_feature ''' visual semantic inference, including visual semantic alignment and clip location regression ''' def visual_semantic_infer(self, visual_feature_train, sentence_embed_train, visual_feature_test, sentence_embed_test): name="CTRL_Model" with tf.variable_scope(name): print "Building training network...............................\n" transformed_clip_train = fc('v2s_lt', visual_feature_train, output_dim=self.semantic_size) transformed_clip_train_norm = tf.nn.l2_normalize(transformed_clip_train, dim=1) transformed_sentence_train = fc('s2s_lt', sentence_embed_train, output_dim=self.semantic_size) transformed_sentence_train_norm = tf.nn.l2_normalize(transformed_sentence_train, dim=1) cross_modal_vec_train = self.cross_modal_comb(transformed_clip_train_norm, transformed_sentence_train_norm, self.batch_size) sim_score_mat_train = vs_multilayer.vs_multilayer(cross_modal_vec_train, "vs_multilayer_lt", middle_layer_dim=1000) sim_score_mat_train = tf.reshape(sim_score_mat_train,[self.batch_size, self.batch_size, 3]) tf.get_variable_scope().reuse_variables() print "Building test network...............................\n" transformed_clip_test = fc('v2s_lt', visual_feature_test, output_dim=self.semantic_size) transformed_clip_test_norm = tf.nn.l2_normalize(transformed_clip_test, dim=1) transformed_sentence_test = fc('s2s_lt', sentence_embed_test, output_dim=self.semantic_size) transformed_sentence_test_norm = tf.nn.l2_normalize(transformed_sentence_test, dim=1) cross_modal_vec_test = self.cross_modal_comb(transformed_clip_test_norm, transformed_sentence_test_norm, self.test_batch_size) sim_score_mat_test = vs_multilayer.vs_multilayer(cross_modal_vec_test, "vs_multilayer_lt", reuse=True, middle_layer_dim=1000) sim_score_mat_test = tf.reshape(sim_score_mat_test, [3]) return sim_score_mat_train, sim_score_mat_test ''' compute alignment and regression loss ''' def compute_loss_reg(self, sim_reg_mat, offset_label): sim_score_mat, p_reg_mat, l_reg_mat = tf.split(2, 3, sim_reg_mat) sim_score_mat = tf.reshape(sim_score_mat, [self.batch_size, self.batch_size]) l_reg_mat = tf.reshape(l_reg_mat, [self.batch_size, self.batch_size]) p_reg_mat = tf.reshape(p_reg_mat, [self.batch_size, self.batch_size]) # unit matrix with -2 I_2 = tf.diag(tf.constant(-2.0, shape=[self.batch_size])) all1 = tf.constant(1.0, shape=[self.batch_size, self.batch_size]) # | -1 1 1... | # mask_mat = | 1 -1 -1... | # | 1 1 -1 ... | mask_mat = tf.add(I_2, all1) # loss cls, not considering iou I = tf.diag(tf.constant(1.0, shape=[self.batch_size])) I_half = tf.diag(tf.constant(0.5, shape=[self.batch_size])) batch_para_mat = tf.constant(self.alpha, shape=[self.batch_size, self.batch_size]) para_mat = tf.add(I,batch_para_mat) loss_mat = tf.log(tf.add(all1, tf.exp(tf.mul(mask_mat, sim_score_mat)))) loss_mat = tf.mul(loss_mat, para_mat) loss_align = tf.reduce_mean(loss_mat) # regression loss l_reg_diag = tf.matmul(tf.mul(l_reg_mat, I), tf.constant(1.0, shape=[self.batch_size, 1])) p_reg_diag = tf.matmul(tf.mul(p_reg_mat, I), tf.constant(1.0, shape=[self.batch_size, 1])) offset_pred = tf.concat(1, (p_reg_diag, l_reg_diag)) loss_reg = tf.reduce_mean(tf.abs(tf.sub(offset_pred, offset_label))) loss=tf.add(tf.mul(self.lambda_regression, loss_reg), loss_align) return loss, offset_pred, loss_reg def init_placeholder(self): visual_featmap_ph_train = tf.placeholder(tf.float32, shape=(self.batch_size, self.visual_feature_dim)) sentence_ph_train = tf.placeholder(tf.float32, shape=(self.batch_size, self.sentence_embedding_size)) offset_ph = tf.placeholder(tf.float32, shape=(self.batch_size,2)) visual_featmap_ph_test = tf.placeholder(tf.float32, shape=(self.test_batch_size, self.visual_feature_dim)) sentence_ph_test = tf.placeholder(tf.float32, shape=(self.test_batch_size, self.sentence_embedding_size)) return visual_featmap_ph_train,sentence_ph_train,offset_ph,visual_featmap_ph_test,sentence_ph_test def get_variables_by_name(self,name_list): v_list = tf.trainable_variables() v_dict = {} for name in name_list: v_dict[name] = [] for v in v_list: for name in name_list: if name in v.name: v_dict[name].append(v) for name in name_list: print "Variables of <"+name+">" for v in v_dict[name]: print " "+v.name return v_dict def training(self, loss): v_dict = self.get_variables_by_name(["lt"]) vs_optimizer = tf.train.AdamOptimizer(self.vs_lr, name='vs_adam') vs_train_op = vs_optimizer.minimize(loss, var_list=v_dict["lt"]) return vs_train_op def construct_model(self): # initialize the placeholder self.visual_featmap_ph_train, self.sentence_ph_train, self.offset_ph, self.visual_featmap_ph_test, self.sentence_ph_test=self.init_placeholder() # build inference network sim_reg_mat, sim_reg_mat_test = self.visual_semantic_infer(self.visual_featmap_ph_train, self.sentence_ph_train, self.visual_featmap_ph_test, self.sentence_ph_test) # compute loss self.loss_align_reg, offset_pred, loss_reg = self.compute_loss_reg(sim_reg_mat, self.offset_ph) # optimize self.vs_train_op = self.training(self.loss_align_reg) return self.loss_align_reg, self.vs_train_op, sim_reg_mat_test, offset_pred, loss_reg ================================================ FILE: ctrl_test_results.txt ================================================ ================================================ FILE: dataset.py ================================================ import numpy as np from math import sqrt import os import random import pickle ''' calculate temporal intersection over union ''' def calculate_IoU(i0, i1): union = (min(i0[0], i1[0]), max(i0[1], i1[1])) inter = (max(i0[0], i1[0]), min(i0[1], i1[1])) iou = 1.0*(inter[1]-inter[0])/(union[1]-union[0]) return iou ''' calculate the non Intersection part over Length ratia, make sure the input IoU is larger than 0 ''' def calculate_nIoL(base, sliding_clip): inter = (max(base[0], sliding_clip[0]), min(base[1], sliding_clip[1])) inter_l = inter[1]-inter[0] length = sliding_clip[1]-sliding_clip[0] nIoL = 1.0*(length-inter_l)/length return nIoL class TrainingDataSet(object): def __init__(self, sliding_dir, it_path, batch_size): self.counter = 0 self.batch_size = batch_size self.context_num = 1 self.context_size = 128 print "Reading training data list from "+it_path cs = pickle.load(open(it_path)) movie_length_info = pickle.load(open("./video_allframes_info.pkl")) self.clip_sentence_pairs = [] for l in cs: clip_name = l[0] sent_vecs = l[1] for sent_vec in sent_vecs: self.clip_sentence_pairs.append((clip_name, sent_vec)) movie_names_set = set() self.movie_clip_names = {} # read groundtruth sentence-clip pairs for k in range(len(self.clip_sentence_pairs)): clip_name = self.clip_sentence_pairs[k][0] movie_name = clip_name.split("_")[0] if not movie_name in movie_names_set: movie_names_set.add(movie_name) self.movie_clip_names[movie_name] = [] self.movie_clip_names[movie_name].append(k) self.movie_names = list(movie_names_set) self.visual_feature_dim = 4096*3 self.sent_vec_dim = 4800 self.num_samples = len(self.clip_sentence_pairs) self.sliding_clip_path = sliding_dir print str(len(self.clip_sentence_pairs))+" clip-sentence pairs are readed" # read sliding windows, and match them with the groundtruths to make training samples sliding_clips_tmp = os.listdir(self.sliding_clip_path) self.clip_sentence_pairs_iou = [] for clip_name in sliding_clips_tmp: if clip_name.split(".")[2]=="npy": movie_name = clip_name.split("_")[0] for clip_sentence in self.clip_sentence_pairs: original_clip_name = clip_sentence[0] original_movie_name = original_clip_name.split("_")[0] if original_movie_name==movie_name: start = int(clip_name.split("_")[1]) end = int(clip_name.split("_")[2].split(".")[0]) o_start = int(original_clip_name.split("_")[1]) o_end = int(original_clip_name.split("_")[2].split(".")[0]) iou = calculate_IoU((start, end), (o_start, o_end)) if iou>0.5: nIoL=calculate_nIoL((o_start, o_end), (start, end)) if nIoL<0.15: movie_length = movie_length_info[movie_name.split(".")[0]] start_offset =o_start-start end_offset = o_end-end self.clip_sentence_pairs_iou.append((clip_sentence[0], clip_sentence[1], clip_name, start_offset, end_offset)) self.num_samples_iou = len(self.clip_sentence_pairs_iou) print str(len(self.clip_sentence_pairs_iou))+" iou clip-sentence pairs are readed" ''' compute left (pre) and right (post) context features ''' def get_context_window(self, clip_name, win_length): movie_name = clip_name.split("_")[0] start = int(clip_name.split("_")[1]) end = int(clip_name.split("_")[2].split(".")[0]) clip_length = self.context_size left_context_feats = np.zeros([win_length, 4096], dtype=np.float32) right_context_feats = np.zeros([win_length, 4096], dtype=np.float32) last_left_feat = np.load(self.sliding_clip_path+clip_name) last_right_feat = np.load(self.sliding_clip_path+clip_name) for k in range(win_length): left_context_start = start-clip_length*(k+1) left_context_end = start-clip_length*k right_context_start = end+clip_length*k right_context_end = end+clip_length*(k+1) left_context_name = movie_name+"_"+str(left_context_start)+"_"+str(left_context_end)+".npy" right_context_name = movie_name+"_"+str(right_context_start)+"_"+str(right_context_end)+".npy" if os.path.exists(self.sliding_clip_path+left_context_name): left_context_feat = np.load(self.sliding_clip_path+left_context_name) last_left_feat = left_context_feat else: left_context_feat = last_left_feat if os.path.exists(self.sliding_clip_path+right_context_name): right_context_feat = np.load(self.sliding_clip_path+right_context_name) last_right_feat = right_context_feat else: right_context_feat = last_right_feat left_context_feats[k] = left_context_feat right_context_feats[k] = right_context_feat return np.mean(left_context_feats, axis=0), np.mean(right_context_feats, axis=0) ''' read next batch of training data, this function is used for training CTRL-aln ''' def next_batch(self): random_batch_index = random.sample(range(self.num_samples), self.batch_size) image_batch = np.zeros([self.batch_size, self.visual_feature_dim]) sentence_batch = np.zeros([self.batch_size, self.sent_vec_dim]) offset_batch = np.zeros([self.batch_size, 2], dtype=np.float32) # this one is actually useless index = 0 clip_set=set() while index < self.batch_size: k = random_batch_index[index] clip_name = self.clip_sentence_pairs[k][0] if not clip_name in clip_set: clip_set.add(clip_name) feat_path = self.image_dir+self.clip_sentence_pairs[k][0]+".npy" featmap = np.load(feat_path) image_batch[index,:] = featmap sentence_batch[index,:] = self.clip_sentence_pairs[k][1][:self.sent_vec_dim] index+=1 else: r = random.choice(range(self.num_samples)) random_batch_index[index] = r continue return image_batch, sentence_batch, offset_batch ''' read next batch of training data, this function is used for training CTRL-reg ''' def next_batch_iou(self): random_batch_index = random.sample(range(self.num_samples_iou), self.batch_size) image_batch = np.zeros([self.batch_size, self.visual_feature_dim]) sentence_batch = np.zeros([self.batch_size, self.sent_vec_dim]) offset_batch = np.zeros([self.batch_size, 2], dtype=np.float32) index = 0 clip_set = set() while index < self.batch_size: k = random_batch_index[index] clip_name = self.clip_sentence_pairs_iou[k][0] if not clip_name in clip_set: clip_set.add(clip_name) feat_path = self.sliding_clip_path+self.clip_sentence_pairs_iou[k][2] featmap = np.load(feat_path) # read context features left_context_feat, right_context_feat = self.get_context_window(self.clip_sentence_pairs_iou[k][2], self.context_num) image_batch[index,:] = np.hstack((left_context_feat, featmap, right_context_feat)) sentence_batch[index,:] = self.clip_sentence_pairs_iou[k][1][:self.sent_vec_dim] p_offset = self.clip_sentence_pairs_iou[k][3] l_offset = self.clip_sentence_pairs_iou[k][4] offset_batch[index,0] = p_offset offset_batch[index,1] = l_offset index+=1 else: r = random.choice(range(self.num_samples_iou)) random_batch_index[index] = r continue return image_batch, sentence_batch, offset_batch class TestingDataSet(object): def __init__(self, img_dir, csv_path, batch_size): #il_path: image_label_file path #self.index_in_epoch = 0 #self.epochs_completed = 0 self.batch_size = batch_size self.image_dir = img_dir print "Reading testing data list from "+csv_path self.semantic_size = 4800 csv = pickle.load(open(csv_path)) self.clip_sentence_pairs = [] for l in csv: clip_name = l[0] sent_vecs = l[1] for sent_vec in sent_vecs: self.clip_sentence_pairs.append((clip_name, sent_vec)) print str(len(self.clip_sentence_pairs))+" pairs are readed" movie_names_set = set() self.movie_clip_names = {} for k in range(len(self.clip_sentence_pairs)): clip_name = self.clip_sentence_pairs[k][0] movie_name = clip_name.split("_")[0] if not movie_name in movie_names_set: movie_names_set.add(movie_name) self.movie_clip_names[movie_name] = [] self.movie_clip_names[movie_name].append(k) self.movie_names = list(movie_names_set) self.clip_num_per_movie_max = 0 for movie_name in self.movie_clip_names: if len(self.movie_clip_names[movie_name])>self.clip_num_per_movie_max: self.clip_num_per_movie_max = len(self.movie_clip_names[movie_name]) print "Max number of clips in a movie is "+str(self.clip_num_per_movie_max) self.sliding_clip_path = img_dir sliding_clips_tmp = os.listdir(self.sliding_clip_path) self.sliding_clip_names = [] for clip_name in sliding_clips_tmp: if clip_name.split(".")[2]=="npy": movie_name = clip_name.split("_")[0] if movie_name in self.movie_clip_names: self.sliding_clip_names.append(clip_name.split(".")[0]+"."+clip_name.split(".")[1]) self.num_samples = len(self.clip_sentence_pairs) print "sliding clips number: "+str(len(self.sliding_clip_names)) assert self.batch_size <= self.num_samples def get_clip_sample(self, sample_num, movie_name, clip_name): length=len(os.listdir(self.image_dir+movie_name+"/"+clip_name)) sample_step=1.0*length/sample_num sample_pos=np.floor(sample_step*np.array(range(sample_num))) sample_pos_str=[] img_names=os.listdir(self.image_dir+movie_name+"/"+clip_name) # sort is very important! to get a correct sequence order img_names.sort() # print img_names for pos in sample_pos: sample_pos_str.append(self.image_dir+movie_name+"/"+clip_name+"/"+img_names[int(pos)]) return sample_pos_str def get_context_window(self, clip_name, win_length): movie_name = clip_name.split("_")[0] start = int(clip_name.split("_")[1]) end = int(clip_name.split("_")[2].split(".")[0]) clip_length = 128#end-start left_context_feats = np.zeros([win_length,4096], dtype=np.float32) right_context_feats = np.zeros([win_length,4096], dtype=np.float32) last_left_feat = np.load(self.sliding_clip_path+clip_name) last_right_feat = np.load(self.sliding_clip_path+clip_name) for k in range(win_length): left_context_start = start-clip_length*(k+1) left_context_end = start-clip_length*k right_context_start = end+clip_length*k right_context_end = end+clip_length*(k+1) left_context_name = movie_name+"_"+str(left_context_start)+"_"+str(left_context_end)+".npy" right_context_name = movie_name+"_"+str(right_context_start)+"_"+str(right_context_end)+".npy" if os.path.exists(self.sliding_clip_path+left_context_name): left_context_feat = np.load(self.sliding_clip_path+left_context_name) last_left_feat = left_context_feat else: left_context_feat = last_left_feat if os.path.exists(self.sliding_clip_path+right_context_name): right_context_feat = np.load(self.sliding_clip_path+right_context_name) last_right_feat = right_context_feat else: right_context_feat = last_right_feat left_context_feats[k] = left_context_feat right_context_feats[k] = right_context_feat return np.mean(left_context_feats, axis=0), np.mean(right_context_feats, axis=0) def load_movie(self, movie_name): movie_clip_sentences=[] for k in range(len(self.clip_names)): if movie_name in self.clip_names[k]: movie_clip_sentences.append((self.clip_names[k], self.sent_vecs[k][:2400], self.sentences[k])) movie_clip_imgs=[] for k in range(len(self.movie_frames[movie_name])): # print str(k)+"/"+str(len(self.movie_frames[movie_name])) if os.path.isfile(self.movie_frames[movie_name][k][1]) and os.path.getsize(self.movie_frames[movie_name][k][1])!=0: img=load_image(self.movie_frames[movie_name][k][1]) movie_clip_imgs.append((self.movie_frames[movie_name][k][0],img)) return movie_clip_imgs, movie_clip_sentences def load_movie_byclip(self,movie_name,sample_num): movie_clip_sentences=[] movie_clip_featmap=[] clip_set=set() for k in range(len(self.clip_sentence_pairs)): if movie_name in self.clip_sentence_pairs[k][0]: movie_clip_sentences.append((self.clip_sentence_pairs[k][0],self.clip_sentence_pairs[k][1][:self.semantic_size])) if not self.clip_sentence_pairs[k][0] in clip_set: clip_set.add(self.clip_sentence_pairs[k][0]) # print str(k)+"/"+str(len(self.movie_clip_names[movie_name])) visual_feature_path=self.image_dir+self.clip_sentence_pairs[k][0]+".npy" feature_data=np.load(visual_feature_path) movie_clip_featmap.append((self.clip_sentence_pairs[k][0],feature_data)) return movie_clip_featmap, movie_clip_sentences def load_movie_slidingclip(self, movie_name, sample_num): movie_clip_sentences = [] movie_clip_featmap = [] clip_set = set() for k in range(len(self.clip_sentence_pairs)): if movie_name in self.clip_sentence_pairs[k][0]: movie_clip_sentences.append((self.clip_sentence_pairs[k][0], self.clip_sentence_pairs[k][1][:self.semantic_size])) for k in range(len(self.sliding_clip_names)): if movie_name in self.sliding_clip_names[k]: # print str(k)+"/"+str(len(self.movie_clip_names[movie_name])) visual_feature_path = self.sliding_clip_path+self.sliding_clip_names[k]+".npy" #context_feat=self.get_context(self.sliding_clip_names[k]+".npy") left_context_feat,right_context_feat = self.get_context_window(self.sliding_clip_names[k]+".npy",1) feature_data = np.load(visual_feature_path) #comb_feat=np.hstack((context_feat,feature_data)) comb_feat = np.hstack((left_context_feat,feature_data,right_context_feat)) movie_clip_featmap.append((self.sliding_clip_names[k], comb_feat)) return movie_clip_featmap, movie_clip_sentences ================================================ FILE: exp_data/.gitkeep ================================================ ================================================ FILE: main.py ================================================ import tensorflow as tf import numpy as np import ctrl_model from six.moves import xrange import time from sklearn.metrics import average_precision_score import pickle import vs_multilayer import operator def dense_to_one_hot(labels_dense, num_classes): """Convert class labels from scalars to one-hot vectors.""" num_labels = labels_dense.shape[0] index_offset = np.arange(num_labels) * num_classes labels_one_hot = np.zeros((num_labels, num_classes)) labels_one_hot.flat[index_offset+labels_dense.ravel()] = 1 return labels_one_hot def compute_ap(class_score_matrix, labels): num_classes=class_score_matrix.shape[1] one_hot_labels=dense_to_one_hot(labels, num_classes) predictions=np.array(class_score_matrix>0, dtype="int32") average_precision=[] for i in range(num_classes): ps=average_precision_score(one_hot_labels[:, i], class_score_matrix[:, i]) # if not np.isnan(ps): average_precision.append(ps) return np.array(average_precision) def calculate_IoU(i0,i1): union = (min(i0[0], i1[0]), max(i0[1], i1[1])) inter = (max(i0[0], i1[0]), min(i0[1], i1[1])) iou = 1.0*(inter[1]-inter[0])/(union[1]-union[0]) return iou def nms_temporal(x1,x2,s, overlap): pick = [] assert len(x1)==len(s) assert len(x2)==len(s) if len(x1)==0: return pick #x1 = [b[0] for b in boxes] #x2 = [b[1] for b in boxes] #s = [b[-1] for b in boxes] union = map(operator.sub, x2, x1) # union = x2-x1 I = [i[0] for i in sorted(enumerate(s), key=lambda x:x[1])] # sort and get index while len(I)>0: i = I[-1] pick.append(i) xx1 = [max(x1[i],x1[j]) for j in I[:-1]] xx2 = [min(x2[i],x2[j]) for j in I[:-1]] inter = [max(0.0, k2-k1) for k1, k2 in zip(xx1, xx2)] o = [inter[u]/(union[i] + union[I[u]] - inter[u]) for u in range(len(I)-1)] I_new = [] for j in range(len(o)): if o[j] <=overlap: I_new.append(I[j]) I = I_new return pick ''' compute recall at certain IoU ''' def compute_IoU_recall_top_n_forreg(top_n, iou_thresh, sentence_image_mat, sentence_image_reg_mat, sclips, iclips): correct_num = 0.0 for k in range(sentence_image_mat.shape[0]): gt = sclips[k] gt_start = float(gt.split("_")[1]) gt_end = float(gt.split("_")[2]) #print gt +" "+str(gt_start)+" "+str(gt_end) sim_v = [v for v in sentence_image_mat[k]] starts = [s for s in sentence_image_reg_mat[k,:,0]] ends = [e for e in sentence_image_reg_mat[k,:,1]] picks = nms_temporal(starts,ends, sim_v, iou_thresh-0.05) #sim_argsort=np.argsort(sim_v)[::-1][0:top_n] if top_n=iou_thresh: correct_num+=1 break return correct_num ''' evaluate the model ''' def do_eval_slidingclips(sess, vs_eval_op, model, movie_length_info, iter_step, test_result_output): IoU_thresh = [0.1, 0.2, 0.3, 0.4, 0.5] all_correct_num_10 = [0.0]*5 all_correct_num_5 = [0.0]*5 all_correct_num_1 = [0.0]*5 all_retrievd = 0.0 for movie_name in model.test_set.movie_names: movie_length=movie_length_info[movie_name.split(".")[0]] print "Test movie: "+movie_name+"....loading movie data" movie_clip_featmaps, movie_clip_sentences=model.test_set.load_movie_slidingclip(movie_name, 16) print "sentences: "+ str(len(movie_clip_sentences)) print "clips: "+ str(len(movie_clip_featmaps)) sentence_image_mat=np.zeros([len(movie_clip_sentences), len(movie_clip_featmaps)]) sentence_image_reg_mat=np.zeros([len(movie_clip_sentences), len(movie_clip_featmaps), 2]) for k in range(len(movie_clip_sentences)): #sentence_clip_name=movie_clip_sentences[k][0] #start=float(sentence_clip_name.split("_")[1]) #end=float(sentence_clip_name.split("_")[2].split("_")[0]) sent_vec=movie_clip_sentences[k][1] sent_vec=np.reshape(sent_vec,[1,sent_vec.shape[0]]) for t in range(len(movie_clip_featmaps)): featmap = movie_clip_featmaps[t][1] visual_clip_name = movie_clip_featmaps[t][0] start = float(visual_clip_name.split("_")[1]) end = float(visual_clip_name.split("_")[2].split("_")[0]) featmap = np.reshape(featmap, [1, featmap.shape[0]]) feed_dict = { model.visual_featmap_ph_test: featmap, model.sentence_ph_test:sent_vec } outputs = sess.run(vs_eval_op,feed_dict=feed_dict) sentence_image_mat[k,t] = outputs[0] reg_clip_length = (end-start)*(10**outputs[2]) reg_mid_point = (start+end)/2.0+movie_length*outputs[1] reg_end = end+outputs[2] reg_start = start+outputs[1] sentence_image_reg_mat[k,t,0] = reg_start sentence_image_reg_mat[k,t,1] = reg_end iclips = [b[0] for b in movie_clip_featmaps] sclips = [b[0] for b in movie_clip_sentences] # calculate Recall@m, IoU=n for k in range(len(IoU_thresh)): IoU=IoU_thresh[k] correct_num_10 = compute_IoU_recall_top_n_forreg(10, IoU, sentence_image_mat, sentence_image_reg_mat, sclips, iclips) correct_num_5 = compute_IoU_recall_top_n_forreg(5, IoU, sentence_image_mat, sentence_image_reg_mat, sclips, iclips) correct_num_1 = compute_IoU_recall_top_n_forreg(1, IoU, sentence_image_mat, sentence_image_reg_mat, sclips, iclips) print movie_name+" IoU="+str(IoU)+", R@10: "+str(correct_num_10/len(sclips))+"; IoU="+str(IoU)+", R@5: "+str(correct_num_5/len(sclips))+"; IoU="+str(IoU)+", R@1: "+str(correct_num_1/len(sclips)) all_correct_num_10[k]+=correct_num_10 all_correct_num_5[k]+=correct_num_5 all_correct_num_1[k]+=correct_num_1 all_retrievd+=len(sclips) for k in range(len(IoU_thresh)): print " IoU="+str(IoU_thresh[k])+", R@10: "+str(all_correct_num_10[k]/all_retrievd)+"; IoU="+str(IoU_thresh[k])+", R@5: "+str(all_correct_num_5[k]/all_retrievd)+"; IoU="+str(IoU_thresh[k])+", R@1: "+str(all_correct_num_1[k]/all_retrievd) test_result_output.write("Step "+str(iter_step)+": IoU="+str(IoU_thresh[k])+", R@10: "+str(all_correct_num_10[k]/all_retrievd)+"; IoU="+str(IoU_thresh[k])+", R@5: "+str(all_correct_num_5[k]/all_retrievd)+"; IoU="+str(IoU_thresh[k])+", R@1: "+str(all_correct_num_1[k]/all_retrievd)+"\n") def run_training(): initial_steps = 0 max_steps = 20000 batch_size = 56 train_csv_path = "./exp_data/TACoS/train_clip-sentvec.pkl" test_csv_path = "./exp_data/TACoS/test_clip-sentvec.pkl" test_feature_dir="../TACOS/Interval128_256_overlap0.8_c3d_fc6/" train_feature_dir = "../TACOS/Interval64_128_256_512_overlap0.8_c3d_fc6/" model = ctrl_model.CTRL_Model(batch_size, train_csv_path, test_csv_path, test_feature_dir, train_feature_dir) test_result_output=open("ctrl_test_results.txt", "w") with tf.Graph().as_default(): loss_align_reg, vs_train_op, vs_eval_op, offset_pred, loss_reg = model.construct_model() # Create a session for running Ops on the Graph. gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction = 0.2) sess = tf.Session(config = tf.ConfigProto(gpu_options = gpu_options)) # Run the Op to initialize the variables. init = tf.initialize_all_variables() sess.run(init) for step in xrange(max_steps): start_time = time.time() feed_dict = model.fill_feed_dict_train_reg() _, loss_value, offset_pred_v, loss_reg_v = sess.run([vs_train_op, loss_align_reg, offset_pred, loss_reg], feed_dict=feed_dict) duration = time.time() - start_time if step % 5 == 0: # Print status to stdout. print('Step %d: loss = %.3f (%.3f sec)' % (step, loss_value, duration)) if (step+1) % 2000 == 0: print "Start to test:-----------------\n" movie_length_info=pickle.load(open("./video_allframes_info.pkl")) do_eval_slidingclips(sess, vs_eval_op, model, movie_length_info, step+1, test_result_output) def main(_): run_training() if __name__ == '__main__': tf.app.run() ================================================ FILE: util/__init__.py ================================================ ================================================ FILE: util/cnn.py ================================================ from __future__ import division #, print_function import tensorflow as tf def conv_layer(name, bottom, kernel_size, stride, output_dim, padding='SAME', bias_term=True, weights_initializer=None, biases_initializer=None): # input has shape [batch, in_height, in_width, in_channels] input_dim = bottom.get_shape().as_list()[-1] # weights and biases variables with tf.variable_scope(name): if weights_initializer is None and biases_initializer is None: # initialize the variables if weights_initializer is None: weights_initializer = tf.random_normal_initializer(mean=0.0, stddev=0.01) if bias_term and biases_initializer is None: biases_initializer = tf.constant_initializer(0.) print "input_dim"+str(input_dim) # filter has shape [filter_height, filter_width, in_channels, out_channels] weights = tf.get_variable("weights", [kernel_size, kernel_size, input_dim, output_dim], initializer=weights_initializer) if bias_term: biases = tf.get_variable("biases", output_dim, initializer=biases_initializer) print str(weights.name)+" initialized as random or retrieved from graph" if bias_term: print biases.name+" initialized as random or retrieved from graph" else: weights = tf.get_variable("weights", shape=None, initializer=weights_initializer) if bias_term: biases = tf.get_variable("biases", shape=None, initializer=biases_initializer) print weights.name+" initialized from pre-trained parameters or retrieved from graph" if bias_term: print biases.name+" initialized from pre-trained parameters or retrieved from graph" conv = tf.nn.conv2d(bottom, filter=weights, strides=[1, stride, stride, 1], padding=padding) if bias_term: conv = tf.nn.bias_add(conv, biases) return conv def conv_relu_layer(name, bottom, kernel_size, stride, output_dim, padding='SAME', bias_term=True, weights_initializer=None, biases_initializer=None): conv = conv_layer(name, bottom, kernel_size, stride, output_dim, padding, bias_term, weights_initializer, biases_initializer) relu = tf.nn.relu(conv) return relu def deconv_layer(name, bottom, kernel_size, stride, output_dim, padding='SAME', bias_term=True, weights_initializer=None, biases_initializer=None): # input_shape is [batch, in_height, in_width, in_channels] input_shape = bottom.get_shape().as_list() batch_size, input_height, input_width, input_dim = input_shape output_shape = [batch_size, input_height*stride, input_width*stride, output_dim] # weights and biases variables with tf.variable_scope(name): # initialize the variables if weights_initializer is None: weights_initializer = tf.random_normal_initializer() if bias_term and biases_initializer is None: biases_initializer = tf.constant_initializer(0.) # filter has shape [filter_height, filter_width, out_channels, in_channels] weights = tf.get_variable("weights", [kernel_size, kernel_size, output_dim, input_dim], initializer=weights_initializer) if bias_term: biases = tf.get_variable("biases", output_dim, initializer=biases_initializer) deconv = tf.nn.conv2d_transpose(bottom, filter=weights, output_shape=output_shape, strides=[1, stride, stride, 1], padding=padding) if bias_term: deconv = tf.nn.bias_add(deconv, biases) return deconv def deconv_relu_layer(name, bottom, kernel_size, stride, output_dim, padding='SAME', bias_term=True, weights_initializer=None, biases_initializer=None): deconv = deconv_layer(name, bottom, kernel_size, stride, output_dim, padding, bias_term, weights_initializer, biases_initializer) relu = tf.nn.relu(deconv) return relu def pooling_layer(name, bottom, kernel_size, stride): pool = tf.nn.max_pool(bottom, ksize=[1, kernel_size, kernel_size, 1], strides=[1, stride, stride, 1], padding='SAME', name=name) return pool def fc_layer(name, bottom, output_dim, bias_term=True, weights_initializer=None, biases_initializer=None): # flatten bottom input # input has shape [batch, in_height, in_width, in_channels] shape = bottom.get_shape().as_list() input_dim = 1 for d in shape[1:]: input_dim *= d flat_bottom = tf.reshape(bottom, [-1, input_dim]) # weights and biases variables with tf.variable_scope(name): if weights_initializer is None and biases_initializer is None: # initialize the variables if weights_initializer is None: weights_initializer = tf.random_normal_initializer() if bias_term and biases_initializer is None: biases_initializer = tf.constant_initializer(0.) # weights has shape [input_dim, output_dim] weights = tf.get_variable("weights", [input_dim, output_dim], initializer=weights_initializer) if bias_term: biases = tf.get_variable("biases", output_dim, initializer=biases_initializer) print weights.name+" initialized as random or retrieved from graph" if bias_term: print biases.name+" initialized as random or retrieved from graph" else: weights = tf.get_variable("weights", shape=None, initializer=weights_initializer) if bias_term: biases = tf.get_variable("biases", shape=None, initializer=biases_initializer) print weights.name+" initialized from pre-trained parameters or retrieved from graph" if bias_term: print biases.name+" initialized from pre-trained parameters or retrieved from graph" if bias_term: fc = tf.nn.xw_plus_b(flat_bottom, weights, biases) else: fc = tf.matmul(flat_bottom, weights) return fc def fc_relu_layer(name, bottom, output_dim, bias_term=True, weights_initializer=None, biases_initializer=None): fc = fc_layer(name, bottom, output_dim, bias_term, weights_initializer, biases_initializer) relu = tf.nn.relu(fc) return relu def softmax_loss_layer(name, score_bottom, label_bottom): """ Calculates cumulative Softmax Cross Entropy Loss along the last dimension *This function does not divide the loss by batch size* Once tensorflow has SparseCrossEntropy function, this one will be replaced """ # Check shape score_shape = score_bottom.get_shape().as_list() label_shape = label_bottom.get_shape().as_list() assert len(score_shape) == len(label_shape) + 1 assert score_shape[:-1] == label_shape # Compute the outer dimensions dimensions in label inner_dim = score_shape[-1] outer_dim = 1 for d in label_shape: outer_dim *= d # flatten score and label flat_score = tf.reshape(score_bottom, [outer_dim, inner_dim]) flat_label = tf.reshape(label_bottom, [outer_dim, 1]) # Reshape the labels into a dense Tensor of # shape [batch_size, NUM_CLASSES]. sparse_labels = tf.reshape(labels, [FLAGS.batch_size, 1]) indices = tf.reshape(tf.range(FLAGS.batch_size), [FLAGS.batch_size, 1]) concated = tf.concat(1, [indices, sparse_labels]) dense_labels = tf.sparse_to_dense(concated, [FLAGS.batch_size, NUM_CLASSES], 1.0, 0.0) ================================================ FILE: video_allframes_info.pkl ================================================ (dp0 S's30-d43' p1 I19807 sS's30-d40' p2 I4911 sS's30-d41' p3 I19163 sS's22-d55' p4 I1530 sS's25-d52' p5 I5676 sS's25-d51' p6 I3409 sS's28-d39' p7 I22908 sS's35-d55' p8 I6758 sS's24-d41' p9 I17753 sS's24-d40' p10 I6322 sS's24-d48' p11 I2397 sS's21-d29' p12 I1881 sS's21-d39' p13 I3774 sS's22-d29' p14 I3253 sS's33-d27' p15 I20003 sS's22-d46' p16 I14844 sS's32-d70' p17 I7357 sS's13-d52' p18 I2767 sS's22-d48' p19 I2467 sS's13-d54' p20 I8199 sS's37-d39' p21 I9169 sS's25-d23' p22 I3808 sS's27-d70' p23 I8218 sS's36-d50' p24 I5219 sS's26-d70' p25 I5847 sS's22-d53' p26 I6013 sS's21-d23' p27 I3452 sS's21-d21' p28 I4450 sS's24-d53' p29 I5237 sS's21-d28' p30 I3693 sS's32-d69' p31 I12409 sS's35-d48' p32 I7628 sS's31-d28' p33 I5841 sS's31-d25' p34 I2650 sS's22-d25' p35 I1621 sS's29-d42' p36 I22198 sS's35-d40' p37 I13864 sS's23-d31' p38 I3216 sS's17-d48' p39 I2273 sS's36-d42' p40 I18133 sS's34-d34' p41 I17675 sS's35-d41' p42 I27306 sS's34-d41' p43 I15419 sS's37-d46' p44 I13677 sS's21-d53' p45 I4683 sS's30-d26' p46 I17130 sS's24-d28' p47 I7230 sS's21-d55' p48 I2133 sS's14-d35' p49 I2421 sS's30-d29' p50 I6115 sS's36-d70' p51 I7055 sS's23-d51' p52 I6830 sS's31-d31' p53 I6273 sS's23-d54' p54 I9474 sS's24-d34' p55 I5430 sS's17-d55' p56 I2335 sS's27-d50' p57 I2430 sS's17-d53' p58 I7031 sS's22-d43' p59 I3315 sS's25-d69' p60 I14351 sS's27-d34' p61 I1772 sS's23-d46' p62 I10819 sS's21-d42' p63 I6090 sS's21-d43' p64 I4033 sS's23-d42' p65 I12416 sS's21-d45' p66 I2866 sS's14-d26' p67 I12483 sS's14-d27' p68 I6134 sS's21-d40' p69 I3238 sS's26-d26' p70 I41240 sS's26-d23' p71 I2842 sS's17-d42' p72 I15601 sS's34-d28' p73 I11816 sS's23-d45' p74 I4184 sS's29-d52' p75 I5871 sS's15-d70' p76 I8315 sS's27-d45' p77 I4757 sS's29-d50' p78 I2847 sS's27-d29' p79 I8024 sS's21-d35' p80 I1875 sS's27-d21' p81 I4408 sS's37-d25' p82 I1436 sS's37-d21' p83 I11130 sS's23-d34' p84 I5231 sS's23-d39' p85 I5444 sS's37-d29' p86 I3012 sS's27-d54' p87 I10969 sS's14-d51' p88 I8323 sS's28-d46' p89 I11162 sS's23-d21' p90 I4866 sS's13-d48' p91 I3228 sS's32-d27' p92 I22541 sS's13-d21' p93 I2955 sS's13-d25' p94 I2951 sS's13-d28' p95 I5629 sS's14-d46' p96 I9249 sS's14-d43' p97 I4667 sS's22-d34' p98 I2401 sS's21-d50' p99 I1607 sS's24-d23' p100 I5735 sS's29-d31' p101 I3164 sS's17-d69' p102 I12454 sS's15-d26' p103 I33264 sS's29-d39' p104 I9630 sS's32-d55' p105 I5689 sS's32-d52' p106 I5508 sS's13-d31' p107 I5093 sS's36-d31' p108 I7071 sS's36-d43' p109 I26924 sS's33-d45' p110 I7698 sS's22-d26' p111 I9365 sS's28-d27' p112 I17235 sS's28-d25' p113 I3220 sS's30-d53' p114 I13379 sS's30-d52' p115 I7362 sS's33-d54' p116 I10898 sS's22-d35' p117 I2555 sS's15-d35' p118 I6472 sS's33-d50' p119 I3466 sS's13-d40' p120 I3347 sS's13-d45' p121 I4354 sS's36-d23' p122 I19412 sS's25-d35' p123 I3900 sS's28-d51' p124 I19725 sS's26-d69' p125 I20978 sS's36-d27' p126 I4390 sS's34-d69' p127 I12952 s. ================================================ FILE: vs_multilayer.py ================================================ from __future__ import division import numpy as np import tensorflow as tf # components from tensorflow.python.ops.nn import dropout as drop from util.cnn import conv_layer as conv from util.cnn import conv_relu_layer as conv_relu from util.cnn import pooling_layer as pool from util.cnn import fc_layer as fc from util.cnn import fc_relu_layer as fc_relu def vs_multilayer(input_batch,name,middle_layer_dim=1000,reuse=False): with tf.variable_scope(name): if reuse==True: print name+" reuse variables" tf.get_variable_scope().reuse_variables() else: print name+" doesn't reuse variables" layer1 = conv_relu('layer1', input_batch, kernel_size=1,stride=1,output_dim=middle_layer_dim) sim_score = conv('layer2', layer1, kernel_size=1,stride=1,output_dim=3) return sim_score