Repository: jiyanggao/TALL
Branch: master
Commit: 3df6794af148
Files: 10
Total size: 45.8 KB
Directory structure:
gitextract_wdpa8dzs/
├── README.md
├── ctrl_model.py
├── ctrl_test_results.txt
├── dataset.py
├── exp_data/
│ └── .gitkeep
├── main.py
├── util/
│ ├── __init__.py
│ └── cnn.py
├── video_allframes_info.pkl
└── vs_multilayer.py
================================================
FILE CONTENTS
================================================
================================================
FILE: README.md
================================================
## TALL: Temporal Activity Localization via Language Query
This is the repository for our ICCV 2017 paper [_TALL: Temporal Activity Localization via Language Query_](https://arxiv.org/abs/1705.02101).
### Visual Features on TACoS
Download the C3D features for the [training set](https://drive.google.com/file/d/1zQp0aYGFCm8PqqHOh4UtXfy2U3pJMBeu/view?usp=sharing) and [test set](https://drive.google.com/file/d/1zC-UrspRf42Qiu5prQw4fQrbgLQfJN-P/view?usp=sharing) of the TACoS dataset, then modify the paths to the feature folders in `main.py`.
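For reference, these are the path variables to edit in `run_training()` in `main.py`, shown with the defaults used in this repository:

```python
# main.py, run_training()
train_csv_path = "./exp_data/TACoS/train_clip-sentvec.pkl"
test_csv_path = "./exp_data/TACoS/test_clip-sentvec.pkl"
test_feature_dir = "../TACOS/Interval128_256_overlap0.8_c3d_fc6/"
train_feature_dir = "../TACOS/Interval64_128_256_512_overlap0.8_c3d_fc6/"
```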
### Sentence Embeddings on TACoS
Download the Skip-thought sentence embeddings and sample files for the TACoS dataset from [here](https://drive.google.com/file/d/1HF-hNFPvLrHwI5O7YvYKZWTeTxC5Mg1K/view?usp=sharing), and put them under the `exp_data` folder.
### Reproduce the results on TACoS
`python main.py`
### Charades-STA anno download
The sentence temporal annotations on the [Charades](http://allenai.org/plato/charades/) dataset are available here: [train](https://drive.google.com/file/d/1ZjG7wJpPSMIBYnW7BAG2u9VVEoNvFm5c/view?usp=sharing), [test](https://drive.google.com/file/d/1QG4MXFkoj6JFU0YK5olTY75xTARKSW5e/view?usp=sharing). The format is "[video name] [start time] [end time]##[sentence]". To reproduce the Charades-STA experiments, you will need to generate the Skip-thought embeddings and C3D features for it yourself and modify the code slightly.
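As an illustration, here is a minimal sketch of how one annotation line could be parsed, assuming the three bracketed fields are whitespace-separated and the times are numeric (the helper name is hypothetical, not part of this repository):

```python
def parse_charades_sta_line(line):
    """Parse one '[video name] [start time] [end time]##[sentence]' line."""
    anno, sentence = line.strip().split("##", 1)
    video_name, start_time, end_time = anno.split()
    return video_name, float(start_time), float(end_time), sentence
```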
### Updates on Charades-STA performance
I did some annotation cleaning for Charades-STA (compared to the version used in the ICCV paper); the updated performance is listed below. Please compare against these results when using Charades-STA.
| Model | R@1,IoU=0.5 | R@1,IoU=0.7 | R@5,IoU=0.5 | R@5,IoU=0.7 |
| :--------------- | ----------: | ----------: | ----------: | ----------: |
| CTRL (aln) | 17.69 | 5.91 | 55.54 | 23.79 |
| CTRL (reg-p) | 19.22 | 6.64 | 57.98 | 25.22 |
| CTRL (reg-np) | 21.42 | 7.15 | 59.11 | 26.91 |
================================================
FILE: ctrl_model.py
================================================
import numpy as np
import tensorflow as tf
from tensorflow.python.framework import dtypes
from util.cnn import fc_layer as fc
import vs_multilayer
from dataset import TestingDataSet
from dataset import TrainingDataSet
class CTRL_Model(object):
def __init__(self, batch_size, train_csv_path, test_csv_path, test_visual_feature_dir, train_visual_feature_dir):
self.batch_size = batch_size
self.test_batch_size = 1
self.vs_lr = 0.005
self.lambda_regression = 0.01
self.alpha = 1.0/batch_size
        self.semantic_size = 1024 # dimension of the common visual-semantic embedding space
self.sentence_embedding_size = 4800
self.visual_feature_dim = 4096*3
self.train_set=TrainingDataSet(train_visual_feature_dir, train_csv_path, self.batch_size)
self.test_set=TestingDataSet(test_visual_feature_dir, test_csv_path, self.test_batch_size)
'''
used in training alignment model, CTRL(aln)
'''
def fill_feed_dict_train(self):
image_batch,sentence_batch,offset_batch = self.train_set.next_batch()
input_feed = {
self.visual_featmap_ph_train: image_batch,
self.sentence_ph_train: sentence_batch,
self.offset_ph: offset_batch
}
return input_feed
'''
used in training alignment+regression model, CTRL(reg)
'''
def fill_feed_dict_train_reg(self):
image_batch, sentence_batch, offset_batch = self.train_set.next_batch_iou()
input_feed = {
self.visual_featmap_ph_train: image_batch,
self.sentence_ph_train: sentence_batch,
self.offset_ph: offset_batch
}
return input_feed
'''
cross modal processing module
'''
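    # How the combination works: visual_feat and sentence_embed are both
    # [batch_size, semantic_size]. vv_feature tiles the clip features along the
    # column axis and ss_feature tiles the sentence embeddings along the row
    # axis, so cell (i, j) pairs sentence i with clip j. The element-wise
    # product, the element-wise sum and the concatenation of the two are then
    # stacked into a [1, batch_size, batch_size, 4*semantic_size] tensor that
    # feeds vs_multilayer.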
def cross_modal_comb(self, visual_feat, sentence_embed, batch_size):
vv_feature = tf.reshape(tf.tile(visual_feat, [batch_size, 1]),
[batch_size, batch_size, self.semantic_size])
ss_feature = tf.reshape(tf.tile(sentence_embed,[1, batch_size]),[batch_size, batch_size, self.semantic_size])
concat_feature = tf.reshape(tf.concat(2,[vv_feature, ss_feature]),[batch_size, batch_size, self.semantic_size+self.semantic_size])
print concat_feature.get_shape().as_list()
mul_feature = tf.mul(vv_feature, ss_feature)
add_feature = tf.add(vv_feature, ss_feature)
comb_feature = tf.reshape(tf.concat(2, [mul_feature, add_feature, concat_feature]),[1, batch_size, batch_size, self.semantic_size*4])
return comb_feature
'''
visual semantic inference, including visual semantic alignment and clip location regression
'''
def visual_semantic_infer(self, visual_feature_train, sentence_embed_train, visual_feature_test, sentence_embed_test):
name="CTRL_Model"
with tf.variable_scope(name):
print "Building training network...............................\n"
transformed_clip_train = fc('v2s_lt', visual_feature_train, output_dim=self.semantic_size)
transformed_clip_train_norm = tf.nn.l2_normalize(transformed_clip_train, dim=1)
transformed_sentence_train = fc('s2s_lt', sentence_embed_train, output_dim=self.semantic_size)
transformed_sentence_train_norm = tf.nn.l2_normalize(transformed_sentence_train, dim=1)
cross_modal_vec_train = self.cross_modal_comb(transformed_clip_train_norm, transformed_sentence_train_norm, self.batch_size)
sim_score_mat_train = vs_multilayer.vs_multilayer(cross_modal_vec_train, "vs_multilayer_lt", middle_layer_dim=1000)
sim_score_mat_train = tf.reshape(sim_score_mat_train,[self.batch_size, self.batch_size, 3])
tf.get_variable_scope().reuse_variables()
print "Building test network...............................\n"
transformed_clip_test = fc('v2s_lt', visual_feature_test, output_dim=self.semantic_size)
transformed_clip_test_norm = tf.nn.l2_normalize(transformed_clip_test, dim=1)
transformed_sentence_test = fc('s2s_lt', sentence_embed_test, output_dim=self.semantic_size)
transformed_sentence_test_norm = tf.nn.l2_normalize(transformed_sentence_test, dim=1)
cross_modal_vec_test = self.cross_modal_comb(transformed_clip_test_norm, transformed_sentence_test_norm, self.test_batch_size)
sim_score_mat_test = vs_multilayer.vs_multilayer(cross_modal_vec_test, "vs_multilayer_lt", reuse=True, middle_layer_dim=1000)
sim_score_mat_test = tf.reshape(sim_score_mat_test, [3])
return sim_score_mat_train, sim_score_mat_test
'''
compute alignment and regression loss
'''
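    # Loss overview: sim_reg_mat is [batch_size, batch_size, 3], holding the
    # alignment score and the two location-regression outputs for every
    # (sentence, clip) pair. mask_mat is -1 on the diagonal (matched pairs) and
    # +1 elsewhere, so the alignment term log(1 + exp(mask * score)) rewards
    # high scores for matched pairs and penalizes high scores for mismatched
    # ones, with off-diagonal terms down-weighted by alpha. The regression term
    # is the L1 distance between the diagonal (matched-pair) offset predictions
    # and the ground-truth offsets, weighted by lambda_regression.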
def compute_loss_reg(self, sim_reg_mat, offset_label):
sim_score_mat, p_reg_mat, l_reg_mat = tf.split(2, 3, sim_reg_mat)
sim_score_mat = tf.reshape(sim_score_mat, [self.batch_size, self.batch_size])
l_reg_mat = tf.reshape(l_reg_mat, [self.batch_size, self.batch_size])
p_reg_mat = tf.reshape(p_reg_mat, [self.batch_size, self.batch_size])
        # diagonal matrix with -2 on the diagonal
I_2 = tf.diag(tf.constant(-2.0, shape=[self.batch_size]))
all1 = tf.constant(1.0, shape=[self.batch_size, self.batch_size])
        #             | -1  1  1 ... |
        # mask_mat =  |  1 -1  1 ... |
        #             |  1  1 -1 ... |
mask_mat = tf.add(I_2, all1)
# loss cls, not considering iou
I = tf.diag(tf.constant(1.0, shape=[self.batch_size]))
I_half = tf.diag(tf.constant(0.5, shape=[self.batch_size]))
batch_para_mat = tf.constant(self.alpha, shape=[self.batch_size, self.batch_size])
para_mat = tf.add(I,batch_para_mat)
loss_mat = tf.log(tf.add(all1, tf.exp(tf.mul(mask_mat, sim_score_mat))))
loss_mat = tf.mul(loss_mat, para_mat)
loss_align = tf.reduce_mean(loss_mat)
# regression loss
l_reg_diag = tf.matmul(tf.mul(l_reg_mat, I), tf.constant(1.0, shape=[self.batch_size, 1]))
p_reg_diag = tf.matmul(tf.mul(p_reg_mat, I), tf.constant(1.0, shape=[self.batch_size, 1]))
offset_pred = tf.concat(1, (p_reg_diag, l_reg_diag))
loss_reg = tf.reduce_mean(tf.abs(tf.sub(offset_pred, offset_label)))
loss=tf.add(tf.mul(self.lambda_regression, loss_reg), loss_align)
return loss, offset_pred, loss_reg
def init_placeholder(self):
visual_featmap_ph_train = tf.placeholder(tf.float32, shape=(self.batch_size, self.visual_feature_dim))
sentence_ph_train = tf.placeholder(tf.float32, shape=(self.batch_size, self.sentence_embedding_size))
offset_ph = tf.placeholder(tf.float32, shape=(self.batch_size,2))
visual_featmap_ph_test = tf.placeholder(tf.float32, shape=(self.test_batch_size, self.visual_feature_dim))
sentence_ph_test = tf.placeholder(tf.float32, shape=(self.test_batch_size, self.sentence_embedding_size))
return visual_featmap_ph_train,sentence_ph_train,offset_ph,visual_featmap_ph_test,sentence_ph_test
def get_variables_by_name(self,name_list):
v_list = tf.trainable_variables()
v_dict = {}
for name in name_list:
v_dict[name] = []
for v in v_list:
for name in name_list:
if name in v.name: v_dict[name].append(v)
for name in name_list:
print "Variables of <"+name+">"
for v in v_dict[name]:
print " "+v.name
return v_dict
def training(self, loss):
v_dict = self.get_variables_by_name(["lt"])
vs_optimizer = tf.train.AdamOptimizer(self.vs_lr, name='vs_adam')
vs_train_op = vs_optimizer.minimize(loss, var_list=v_dict["lt"])
return vs_train_op
def construct_model(self):
# initialize the placeholder
self.visual_featmap_ph_train, self.sentence_ph_train, self.offset_ph, self.visual_featmap_ph_test, self.sentence_ph_test=self.init_placeholder()
# build inference network
sim_reg_mat, sim_reg_mat_test = self.visual_semantic_infer(self.visual_featmap_ph_train, self.sentence_ph_train, self.visual_featmap_ph_test, self.sentence_ph_test)
# compute loss
self.loss_align_reg, offset_pred, loss_reg = self.compute_loss_reg(sim_reg_mat, self.offset_ph)
# optimize
self.vs_train_op = self.training(self.loss_align_reg)
return self.loss_align_reg, self.vs_train_op, sim_reg_mat_test, offset_pred, loss_reg
================================================
FILE: ctrl_test_results.txt
================================================
================================================
FILE: dataset.py
================================================
import numpy as np
from math import sqrt
import os
import random
import pickle
'''
calculate temporal intersection over union
'''
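# Example: calculate_IoU((10, 30), (20, 40)) = (30 - 20) / (40 - 10) = 1/3.
# If the two intervals are disjoint, inter[1]-inter[0] is negative and so is the
# returned value; callers only keep pairs whose IoU exceeds a positive
# threshold, so disjoint pairs are effectively treated as having no overlap.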
def calculate_IoU(i0, i1):
union = (min(i0[0], i1[0]), max(i0[1], i1[1]))
inter = (max(i0[0], i1[0]), min(i0[1], i1[1]))
iou = 1.0*(inter[1]-inter[0])/(union[1]-union[0])
return iou
'''
calculate the non-intersection part over length (nIoL) ratio; make sure the input IoU is larger than 0
'''
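# Example: base (ground truth) = (10, 30), sliding_clip = (20, 40): the
# intersection is (20, 30) with length 10 and the sliding clip has length 20,
# so nIoL = (20 - 10) / 20 = 0.5, i.e. half of the sliding window lies outside
# the ground-truth interval.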
def calculate_nIoL(base, sliding_clip):
inter = (max(base[0], sliding_clip[0]), min(base[1], sliding_clip[1]))
inter_l = inter[1]-inter[0]
length = sliding_clip[1]-sliding_clip[0]
nIoL = 1.0*(length-inter_l)/length
return nIoL
class TrainingDataSet(object):
def __init__(self, sliding_dir, it_path, batch_size):
self.counter = 0
self.batch_size = batch_size
self.context_num = 1
self.context_size = 128
print "Reading training data list from "+it_path
cs = pickle.load(open(it_path))
movie_length_info = pickle.load(open("./video_allframes_info.pkl"))
self.clip_sentence_pairs = []
for l in cs:
clip_name = l[0]
sent_vecs = l[1]
for sent_vec in sent_vecs:
self.clip_sentence_pairs.append((clip_name, sent_vec))
movie_names_set = set()
self.movie_clip_names = {}
# read groundtruth sentence-clip pairs
for k in range(len(self.clip_sentence_pairs)):
clip_name = self.clip_sentence_pairs[k][0]
movie_name = clip_name.split("_")[0]
if not movie_name in movie_names_set:
movie_names_set.add(movie_name)
self.movie_clip_names[movie_name] = []
self.movie_clip_names[movie_name].append(k)
self.movie_names = list(movie_names_set)
self.visual_feature_dim = 4096*3
self.sent_vec_dim = 4800
self.num_samples = len(self.clip_sentence_pairs)
self.sliding_clip_path = sliding_dir
        print str(len(self.clip_sentence_pairs))+" clip-sentence pairs are read"
# read sliding windows, and match them with the groundtruths to make training samples
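        # a sliding-window clip is paired with a ground-truth clip-sentence pair
        # as a training sample when their temporal IoU is above 0.5 and the
        # non-intersection-over-length (nIoL) is below 0.15, i.e. the window
        # overlaps the ground truth well and does not stretch far outside it;
        # the start/end offsets to the ground truth are stored for regression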
sliding_clips_tmp = os.listdir(self.sliding_clip_path)
self.clip_sentence_pairs_iou = []
for clip_name in sliding_clips_tmp:
if clip_name.split(".")[2]=="npy":
movie_name = clip_name.split("_")[0]
for clip_sentence in self.clip_sentence_pairs:
original_clip_name = clip_sentence[0]
original_movie_name = original_clip_name.split("_")[0]
if original_movie_name==movie_name:
start = int(clip_name.split("_")[1])
end = int(clip_name.split("_")[2].split(".")[0])
o_start = int(original_clip_name.split("_")[1])
o_end = int(original_clip_name.split("_")[2].split(".")[0])
iou = calculate_IoU((start, end), (o_start, o_end))
if iou>0.5:
nIoL=calculate_nIoL((o_start, o_end), (start, end))
if nIoL<0.15:
movie_length = movie_length_info[movie_name.split(".")[0]]
start_offset =o_start-start
end_offset = o_end-end
self.clip_sentence_pairs_iou.append((clip_sentence[0], clip_sentence[1], clip_name, start_offset, end_offset))
self.num_samples_iou = len(self.clip_sentence_pairs_iou)
        print str(len(self.clip_sentence_pairs_iou))+" iou clip-sentence pairs are read"
'''
compute left (pre) and right (post) context features
'''
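    # The context window looks win_length steps of context_size (128 frames)
    # to the left and right of the clip, loads the C3D feature of each
    # neighbouring window whose .npy file exists, falls back to the most
    # recently seen feature (initially the clip's own feature) when a neighbour
    # is missing, and returns the mean-pooled left and right context features.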
def get_context_window(self, clip_name, win_length):
movie_name = clip_name.split("_")[0]
start = int(clip_name.split("_")[1])
end = int(clip_name.split("_")[2].split(".")[0])
clip_length = self.context_size
left_context_feats = np.zeros([win_length, 4096], dtype=np.float32)
right_context_feats = np.zeros([win_length, 4096], dtype=np.float32)
last_left_feat = np.load(self.sliding_clip_path+clip_name)
last_right_feat = np.load(self.sliding_clip_path+clip_name)
for k in range(win_length):
left_context_start = start-clip_length*(k+1)
left_context_end = start-clip_length*k
right_context_start = end+clip_length*k
right_context_end = end+clip_length*(k+1)
left_context_name = movie_name+"_"+str(left_context_start)+"_"+str(left_context_end)+".npy"
right_context_name = movie_name+"_"+str(right_context_start)+"_"+str(right_context_end)+".npy"
if os.path.exists(self.sliding_clip_path+left_context_name):
left_context_feat = np.load(self.sliding_clip_path+left_context_name)
last_left_feat = left_context_feat
else:
left_context_feat = last_left_feat
if os.path.exists(self.sliding_clip_path+right_context_name):
right_context_feat = np.load(self.sliding_clip_path+right_context_name)
last_right_feat = right_context_feat
else:
right_context_feat = last_right_feat
left_context_feats[k] = left_context_feat
right_context_feats[k] = right_context_feat
return np.mean(left_context_feats, axis=0), np.mean(right_context_feats, axis=0)
'''
read next batch of training data, this function is used for training CTRL-aln
'''
def next_batch(self):
random_batch_index = random.sample(range(self.num_samples), self.batch_size)
image_batch = np.zeros([self.batch_size, self.visual_feature_dim])
sentence_batch = np.zeros([self.batch_size, self.sent_vec_dim])
offset_batch = np.zeros([self.batch_size, 2], dtype=np.float32) # this one is actually useless
index = 0
clip_set=set()
while index < self.batch_size:
k = random_batch_index[index]
clip_name = self.clip_sentence_pairs[k][0]
if not clip_name in clip_set:
clip_set.add(clip_name)
feat_path = self.image_dir+self.clip_sentence_pairs[k][0]+".npy"
featmap = np.load(feat_path)
image_batch[index,:] = featmap
sentence_batch[index,:] = self.clip_sentence_pairs[k][1][:self.sent_vec_dim]
index+=1
else:
r = random.choice(range(self.num_samples))
random_batch_index[index] = r
continue
return image_batch, sentence_batch, offset_batch
'''
read next batch of training data, this function is used for training CTRL-reg
'''
def next_batch_iou(self):
random_batch_index = random.sample(range(self.num_samples_iou), self.batch_size)
image_batch = np.zeros([self.batch_size, self.visual_feature_dim])
sentence_batch = np.zeros([self.batch_size, self.sent_vec_dim])
offset_batch = np.zeros([self.batch_size, 2], dtype=np.float32)
index = 0
clip_set = set()
while index < self.batch_size:
k = random_batch_index[index]
clip_name = self.clip_sentence_pairs_iou[k][0]
if not clip_name in clip_set:
clip_set.add(clip_name)
feat_path = self.sliding_clip_path+self.clip_sentence_pairs_iou[k][2]
featmap = np.load(feat_path)
# read context features
left_context_feat, right_context_feat = self.get_context_window(self.clip_sentence_pairs_iou[k][2], self.context_num)
image_batch[index,:] = np.hstack((left_context_feat, featmap, right_context_feat))
sentence_batch[index,:] = self.clip_sentence_pairs_iou[k][1][:self.sent_vec_dim]
p_offset = self.clip_sentence_pairs_iou[k][3]
l_offset = self.clip_sentence_pairs_iou[k][4]
offset_batch[index,0] = p_offset
offset_batch[index,1] = l_offset
index+=1
else:
r = random.choice(range(self.num_samples_iou))
random_batch_index[index] = r
continue
return image_batch, sentence_batch, offset_batch
class TestingDataSet(object):
def __init__(self, img_dir, csv_path, batch_size):
#il_path: image_label_file path
#self.index_in_epoch = 0
#self.epochs_completed = 0
self.batch_size = batch_size
self.image_dir = img_dir
print "Reading testing data list from "+csv_path
self.semantic_size = 4800
csv = pickle.load(open(csv_path))
self.clip_sentence_pairs = []
for l in csv:
clip_name = l[0]
sent_vecs = l[1]
for sent_vec in sent_vecs:
self.clip_sentence_pairs.append((clip_name, sent_vec))
        print str(len(self.clip_sentence_pairs))+" pairs are read"
movie_names_set = set()
self.movie_clip_names = {}
for k in range(len(self.clip_sentence_pairs)):
clip_name = self.clip_sentence_pairs[k][0]
movie_name = clip_name.split("_")[0]
if not movie_name in movie_names_set:
movie_names_set.add(movie_name)
self.movie_clip_names[movie_name] = []
self.movie_clip_names[movie_name].append(k)
self.movie_names = list(movie_names_set)
self.clip_num_per_movie_max = 0
for movie_name in self.movie_clip_names:
if len(self.movie_clip_names[movie_name])>self.clip_num_per_movie_max: self.clip_num_per_movie_max = len(self.movie_clip_names[movie_name])
print "Max number of clips in a movie is "+str(self.clip_num_per_movie_max)
self.sliding_clip_path = img_dir
sliding_clips_tmp = os.listdir(self.sliding_clip_path)
self.sliding_clip_names = []
for clip_name in sliding_clips_tmp:
if clip_name.split(".")[2]=="npy":
movie_name = clip_name.split("_")[0]
if movie_name in self.movie_clip_names:
self.sliding_clip_names.append(clip_name.split(".")[0]+"."+clip_name.split(".")[1])
self.num_samples = len(self.clip_sentence_pairs)
print "sliding clips number: "+str(len(self.sliding_clip_names))
assert self.batch_size <= self.num_samples
def get_clip_sample(self, sample_num, movie_name, clip_name):
length=len(os.listdir(self.image_dir+movie_name+"/"+clip_name))
sample_step=1.0*length/sample_num
sample_pos=np.floor(sample_step*np.array(range(sample_num)))
sample_pos_str=[]
img_names=os.listdir(self.image_dir+movie_name+"/"+clip_name)
# sort is very important! to get a correct sequence order
img_names.sort()
# print img_names
for pos in sample_pos:
sample_pos_str.append(self.image_dir+movie_name+"/"+clip_name+"/"+img_names[int(pos)])
return sample_pos_str
def get_context_window(self, clip_name, win_length):
movie_name = clip_name.split("_")[0]
start = int(clip_name.split("_")[1])
end = int(clip_name.split("_")[2].split(".")[0])
clip_length = 128#end-start
left_context_feats = np.zeros([win_length,4096], dtype=np.float32)
right_context_feats = np.zeros([win_length,4096], dtype=np.float32)
last_left_feat = np.load(self.sliding_clip_path+clip_name)
last_right_feat = np.load(self.sliding_clip_path+clip_name)
for k in range(win_length):
left_context_start = start-clip_length*(k+1)
left_context_end = start-clip_length*k
right_context_start = end+clip_length*k
right_context_end = end+clip_length*(k+1)
left_context_name = movie_name+"_"+str(left_context_start)+"_"+str(left_context_end)+".npy"
right_context_name = movie_name+"_"+str(right_context_start)+"_"+str(right_context_end)+".npy"
if os.path.exists(self.sliding_clip_path+left_context_name):
left_context_feat = np.load(self.sliding_clip_path+left_context_name)
last_left_feat = left_context_feat
else:
left_context_feat = last_left_feat
if os.path.exists(self.sliding_clip_path+right_context_name):
right_context_feat = np.load(self.sliding_clip_path+right_context_name)
last_right_feat = right_context_feat
else:
right_context_feat = last_right_feat
left_context_feats[k] = left_context_feat
right_context_feats[k] = right_context_feat
return np.mean(left_context_feats, axis=0), np.mean(right_context_feats, axis=0)
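    # note: load_movie below relies on attributes (clip_names, sent_vecs,
    # sentences, movie_frames) and a load_image helper that are not defined in
    # this class; the evaluation in main.py uses load_movie_slidingclip instead.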
def load_movie(self, movie_name):
movie_clip_sentences=[]
for k in range(len(self.clip_names)):
if movie_name in self.clip_names[k]:
movie_clip_sentences.append((self.clip_names[k], self.sent_vecs[k][:2400], self.sentences[k]))
movie_clip_imgs=[]
for k in range(len(self.movie_frames[movie_name])):
# print str(k)+"/"+str(len(self.movie_frames[movie_name]))
if os.path.isfile(self.movie_frames[movie_name][k][1]) and os.path.getsize(self.movie_frames[movie_name][k][1])!=0:
img=load_image(self.movie_frames[movie_name][k][1])
movie_clip_imgs.append((self.movie_frames[movie_name][k][0],img))
return movie_clip_imgs, movie_clip_sentences
def load_movie_byclip(self,movie_name,sample_num):
movie_clip_sentences=[]
movie_clip_featmap=[]
clip_set=set()
for k in range(len(self.clip_sentence_pairs)):
if movie_name in self.clip_sentence_pairs[k][0]:
movie_clip_sentences.append((self.clip_sentence_pairs[k][0],self.clip_sentence_pairs[k][1][:self.semantic_size]))
if not self.clip_sentence_pairs[k][0] in clip_set:
clip_set.add(self.clip_sentence_pairs[k][0])
# print str(k)+"/"+str(len(self.movie_clip_names[movie_name]))
visual_feature_path=self.image_dir+self.clip_sentence_pairs[k][0]+".npy"
feature_data=np.load(visual_feature_path)
movie_clip_featmap.append((self.clip_sentence_pairs[k][0],feature_data))
return movie_clip_featmap, movie_clip_sentences
def load_movie_slidingclip(self, movie_name, sample_num):
movie_clip_sentences = []
movie_clip_featmap = []
clip_set = set()
for k in range(len(self.clip_sentence_pairs)):
if movie_name in self.clip_sentence_pairs[k][0]:
movie_clip_sentences.append((self.clip_sentence_pairs[k][0], self.clip_sentence_pairs[k][1][:self.semantic_size]))
for k in range(len(self.sliding_clip_names)):
if movie_name in self.sliding_clip_names[k]:
# print str(k)+"/"+str(len(self.movie_clip_names[movie_name]))
visual_feature_path = self.sliding_clip_path+self.sliding_clip_names[k]+".npy"
#context_feat=self.get_context(self.sliding_clip_names[k]+".npy")
left_context_feat,right_context_feat = self.get_context_window(self.sliding_clip_names[k]+".npy",1)
feature_data = np.load(visual_feature_path)
#comb_feat=np.hstack((context_feat,feature_data))
comb_feat = np.hstack((left_context_feat,feature_data,right_context_feat))
movie_clip_featmap.append((self.sliding_clip_names[k], comb_feat))
return movie_clip_featmap, movie_clip_sentences
================================================
FILE: exp_data/.gitkeep
================================================
================================================
FILE: main.py
================================================
import tensorflow as tf
import numpy as np
import ctrl_model
from six.moves import xrange
import time
from sklearn.metrics import average_precision_score
import pickle
import vs_multilayer
import operator
def dense_to_one_hot(labels_dense, num_classes):
"""Convert class labels from scalars to one-hot vectors."""
num_labels = labels_dense.shape[0]
index_offset = np.arange(num_labels) * num_classes
labels_one_hot = np.zeros((num_labels, num_classes))
labels_one_hot.flat[index_offset+labels_dense.ravel()] = 1
return labels_one_hot
def compute_ap(class_score_matrix, labels):
num_classes=class_score_matrix.shape[1]
one_hot_labels=dense_to_one_hot(labels, num_classes)
predictions=np.array(class_score_matrix>0, dtype="int32")
average_precision=[]
for i in range(num_classes):
ps=average_precision_score(one_hot_labels[:, i], class_score_matrix[:, i])
# if not np.isnan(ps):
average_precision.append(ps)
return np.array(average_precision)
def calculate_IoU(i0,i1):
union = (min(i0[0], i1[0]), max(i0[1], i1[1]))
inter = (max(i0[0], i1[0]), min(i0[1], i1[1]))
iou = 1.0*(inter[1]-inter[0])/(union[1]-union[0])
return iou
def nms_temporal(x1,x2,s, overlap):
pick = []
assert len(x1)==len(s)
assert len(x2)==len(s)
if len(x1)==0:
return pick
#x1 = [b[0] for b in boxes]
#x2 = [b[1] for b in boxes]
#s = [b[-1] for b in boxes]
union = map(operator.sub, x2, x1) # union = x2-x1
I = [i[0] for i in sorted(enumerate(s), key=lambda x:x[1])] # sort and get index
while len(I)>0:
i = I[-1]
pick.append(i)
xx1 = [max(x1[i],x1[j]) for j in I[:-1]]
xx2 = [min(x2[i],x2[j]) for j in I[:-1]]
inter = [max(0.0, k2-k1) for k1, k2 in zip(xx1, xx2)]
o = [inter[u]/(union[i] + union[I[u]] - inter[u]) for u in range(len(I)-1)]
I_new = []
for j in range(len(o)):
if o[j] <=overlap:
I_new.append(I[j])
I = I_new
return pick
'''
compute recall at certain IoU
'''
def compute_IoU_recall_top_n_forreg(top_n, iou_thresh, sentence_image_mat, sentence_image_reg_mat, sclips, iclips):
correct_num = 0.0
for k in range(sentence_image_mat.shape[0]):
gt = sclips[k]
gt_start = float(gt.split("_")[1])
gt_end = float(gt.split("_")[2])
#print gt +" "+str(gt_start)+" "+str(gt_end)
sim_v = [v for v in sentence_image_mat[k]]
starts = [s for s in sentence_image_reg_mat[k,:,0]]
ends = [e for e in sentence_image_reg_mat[k,:,1]]
picks = nms_temporal(starts,ends, sim_v, iou_thresh-0.05)
#sim_argsort=np.argsort(sim_v)[::-1][0:top_n]
if top_n<len(picks): picks=picks[0:top_n]
for index in picks:
pred_start = sentence_image_reg_mat[k, index, 0]
pred_end = sentence_image_reg_mat[k, index, 1]
iou = calculate_IoU((gt_start, gt_end),(pred_start, pred_end))
if iou>=iou_thresh:
correct_num+=1
break
return correct_num
'''
evaluate the model
'''
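# Evaluation protocol: for every test movie, score each sentence against every
# sliding-window clip, apply the predicted offsets to refine the window
# boundaries, run temporal NMS per sentence, keep the top-n windows, and count
# a sentence as recalled at IoU threshold m if any kept window has IoU >= m
# with its ground-truth clip. Recall is accumulated over all sentences of all
# test movies and reported as R@1/R@5/R@10 for each IoU threshold.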
def do_eval_slidingclips(sess, vs_eval_op, model, movie_length_info, iter_step, test_result_output):
IoU_thresh = [0.1, 0.2, 0.3, 0.4, 0.5]
all_correct_num_10 = [0.0]*5
all_correct_num_5 = [0.0]*5
all_correct_num_1 = [0.0]*5
all_retrievd = 0.0
for movie_name in model.test_set.movie_names:
movie_length=movie_length_info[movie_name.split(".")[0]]
print "Test movie: "+movie_name+"....loading movie data"
movie_clip_featmaps, movie_clip_sentences=model.test_set.load_movie_slidingclip(movie_name, 16)
print "sentences: "+ str(len(movie_clip_sentences))
print "clips: "+ str(len(movie_clip_featmaps))
sentence_image_mat=np.zeros([len(movie_clip_sentences), len(movie_clip_featmaps)])
sentence_image_reg_mat=np.zeros([len(movie_clip_sentences), len(movie_clip_featmaps), 2])
for k in range(len(movie_clip_sentences)):
#sentence_clip_name=movie_clip_sentences[k][0]
#start=float(sentence_clip_name.split("_")[1])
#end=float(sentence_clip_name.split("_")[2].split("_")[0])
sent_vec=movie_clip_sentences[k][1]
sent_vec=np.reshape(sent_vec,[1,sent_vec.shape[0]])
for t in range(len(movie_clip_featmaps)):
featmap = movie_clip_featmaps[t][1]
visual_clip_name = movie_clip_featmaps[t][0]
start = float(visual_clip_name.split("_")[1])
end = float(visual_clip_name.split("_")[2].split("_")[0])
featmap = np.reshape(featmap, [1, featmap.shape[0]])
feed_dict = {
model.visual_featmap_ph_test: featmap,
model.sentence_ph_test:sent_vec
}
outputs = sess.run(vs_eval_op,feed_dict=feed_dict)
sentence_image_mat[k,t] = outputs[0]
reg_clip_length = (end-start)*(10**outputs[2])
reg_mid_point = (start+end)/2.0+movie_length*outputs[1]
reg_end = end+outputs[2]
reg_start = start+outputs[1]
sentence_image_reg_mat[k,t,0] = reg_start
sentence_image_reg_mat[k,t,1] = reg_end
iclips = [b[0] for b in movie_clip_featmaps]
sclips = [b[0] for b in movie_clip_sentences]
# calculate Recall@m, IoU=n
for k in range(len(IoU_thresh)):
IoU=IoU_thresh[k]
correct_num_10 = compute_IoU_recall_top_n_forreg(10, IoU, sentence_image_mat, sentence_image_reg_mat, sclips, iclips)
correct_num_5 = compute_IoU_recall_top_n_forreg(5, IoU, sentence_image_mat, sentence_image_reg_mat, sclips, iclips)
correct_num_1 = compute_IoU_recall_top_n_forreg(1, IoU, sentence_image_mat, sentence_image_reg_mat, sclips, iclips)
print movie_name+" IoU="+str(IoU)+", R@10: "+str(correct_num_10/len(sclips))+"; IoU="+str(IoU)+", R@5: "+str(correct_num_5/len(sclips))+"; IoU="+str(IoU)+", R@1: "+str(correct_num_1/len(sclips))
all_correct_num_10[k]+=correct_num_10
all_correct_num_5[k]+=correct_num_5
all_correct_num_1[k]+=correct_num_1
all_retrievd+=len(sclips)
for k in range(len(IoU_thresh)):
print " IoU="+str(IoU_thresh[k])+", R@10: "+str(all_correct_num_10[k]/all_retrievd)+"; IoU="+str(IoU_thresh[k])+", R@5: "+str(all_correct_num_5[k]/all_retrievd)+"; IoU="+str(IoU_thresh[k])+", R@1: "+str(all_correct_num_1[k]/all_retrievd)
test_result_output.write("Step "+str(iter_step)+": IoU="+str(IoU_thresh[k])+", R@10: "+str(all_correct_num_10[k]/all_retrievd)+"; IoU="+str(IoU_thresh[k])+", R@5: "+str(all_correct_num_5[k]/all_retrievd)+"; IoU="+str(IoU_thresh[k])+", R@1: "+str(all_correct_num_1[k]/all_retrievd)+"\n")
def run_training():
initial_steps = 0
max_steps = 20000
batch_size = 56
train_csv_path = "./exp_data/TACoS/train_clip-sentvec.pkl"
test_csv_path = "./exp_data/TACoS/test_clip-sentvec.pkl"
test_feature_dir="../TACOS/Interval128_256_overlap0.8_c3d_fc6/"
train_feature_dir = "../TACOS/Interval64_128_256_512_overlap0.8_c3d_fc6/"
model = ctrl_model.CTRL_Model(batch_size, train_csv_path, test_csv_path, test_feature_dir, train_feature_dir)
test_result_output=open("ctrl_test_results.txt", "w")
with tf.Graph().as_default():
loss_align_reg, vs_train_op, vs_eval_op, offset_pred, loss_reg = model.construct_model()
# Create a session for running Ops on the Graph.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction = 0.2)
sess = tf.Session(config = tf.ConfigProto(gpu_options = gpu_options))
# Run the Op to initialize the variables.
init = tf.initialize_all_variables()
sess.run(init)
for step in xrange(max_steps):
start_time = time.time()
feed_dict = model.fill_feed_dict_train_reg()
_, loss_value, offset_pred_v, loss_reg_v = sess.run([vs_train_op, loss_align_reg, offset_pred, loss_reg], feed_dict=feed_dict)
duration = time.time() - start_time
if step % 5 == 0:
# Print status to stdout.
print('Step %d: loss = %.3f (%.3f sec)' % (step, loss_value, duration))
if (step+1) % 2000 == 0:
print "Start to test:-----------------\n"
movie_length_info=pickle.load(open("./video_allframes_info.pkl"))
do_eval_slidingclips(sess, vs_eval_op, model, movie_length_info, step+1, test_result_output)
def main(_):
run_training()
if __name__ == '__main__':
tf.app.run()
================================================
FILE: util/__init__.py
================================================
================================================
FILE: util/cnn.py
================================================
from __future__ import division #, print_function
import tensorflow as tf
def conv_layer(name, bottom, kernel_size, stride, output_dim, padding='SAME',
bias_term=True, weights_initializer=None, biases_initializer=None):
# input has shape [batch, in_height, in_width, in_channels]
input_dim = bottom.get_shape().as_list()[-1]
# weights and biases variables
with tf.variable_scope(name):
if weights_initializer is None and biases_initializer is None:
# initialize the variables
if weights_initializer is None:
weights_initializer = tf.random_normal_initializer(mean=0.0, stddev=0.01)
if bias_term and biases_initializer is None:
biases_initializer = tf.constant_initializer(0.)
print "input_dim"+str(input_dim)
# filter has shape [filter_height, filter_width, in_channels, out_channels]
weights = tf.get_variable("weights",
[kernel_size, kernel_size, input_dim, output_dim],
initializer=weights_initializer)
if bias_term:
biases = tf.get_variable("biases", output_dim,
initializer=biases_initializer)
print str(weights.name)+" initialized as random or retrieved from graph"
if bias_term:
print biases.name+" initialized as random or retrieved from graph"
else:
weights = tf.get_variable("weights",
shape=None,
initializer=weights_initializer)
if bias_term:
biases = tf.get_variable("biases", shape=None,
initializer=biases_initializer)
print weights.name+" initialized from pre-trained parameters or retrieved from graph"
if bias_term:
print biases.name+" initialized from pre-trained parameters or retrieved from graph"
conv = tf.nn.conv2d(bottom, filter=weights,
strides=[1, stride, stride, 1], padding=padding)
if bias_term:
conv = tf.nn.bias_add(conv, biases)
return conv
def conv_relu_layer(name, bottom, kernel_size, stride, output_dim, padding='SAME',
bias_term=True, weights_initializer=None, biases_initializer=None):
conv = conv_layer(name, bottom, kernel_size, stride, output_dim, padding,
bias_term, weights_initializer, biases_initializer)
relu = tf.nn.relu(conv)
return relu
def deconv_layer(name, bottom, kernel_size, stride, output_dim, padding='SAME',
bias_term=True, weights_initializer=None, biases_initializer=None):
# input_shape is [batch, in_height, in_width, in_channels]
input_shape = bottom.get_shape().as_list()
batch_size, input_height, input_width, input_dim = input_shape
output_shape = [batch_size, input_height*stride, input_width*stride, output_dim]
# weights and biases variables
with tf.variable_scope(name):
# initialize the variables
if weights_initializer is None:
weights_initializer = tf.random_normal_initializer()
if bias_term and biases_initializer is None:
biases_initializer = tf.constant_initializer(0.)
# filter has shape [filter_height, filter_width, out_channels, in_channels]
weights = tf.get_variable("weights",
[kernel_size, kernel_size, output_dim, input_dim],
initializer=weights_initializer)
if bias_term:
biases = tf.get_variable("biases", output_dim,
initializer=biases_initializer)
deconv = tf.nn.conv2d_transpose(bottom, filter=weights,
output_shape=output_shape, strides=[1, stride, stride, 1],
padding=padding)
if bias_term:
deconv = tf.nn.bias_add(deconv, biases)
return deconv
def deconv_relu_layer(name, bottom, kernel_size, stride, output_dim, padding='SAME',
bias_term=True, weights_initializer=None, biases_initializer=None):
deconv = deconv_layer(name, bottom, kernel_size, stride, output_dim, padding,
bias_term, weights_initializer, biases_initializer)
relu = tf.nn.relu(deconv)
return relu
def pooling_layer(name, bottom, kernel_size, stride):
pool = tf.nn.max_pool(bottom, ksize=[1, kernel_size, kernel_size, 1],
strides=[1, stride, stride, 1], padding='SAME', name=name)
return pool
def fc_layer(name, bottom, output_dim, bias_term=True, weights_initializer=None,
biases_initializer=None):
# flatten bottom input
# input has shape [batch, in_height, in_width, in_channels]
shape = bottom.get_shape().as_list()
input_dim = 1
for d in shape[1:]:
input_dim *= d
flat_bottom = tf.reshape(bottom, [-1, input_dim])
# weights and biases variables
with tf.variable_scope(name):
if weights_initializer is None and biases_initializer is None:
# initialize the variables
if weights_initializer is None:
weights_initializer = tf.random_normal_initializer()
if bias_term and biases_initializer is None:
biases_initializer = tf.constant_initializer(0.)
# weights has shape [input_dim, output_dim]
weights = tf.get_variable("weights", [input_dim, output_dim],
initializer=weights_initializer)
if bias_term:
biases = tf.get_variable("biases", output_dim,
initializer=biases_initializer)
print weights.name+" initialized as random or retrieved from graph"
if bias_term:
print biases.name+" initialized as random or retrieved from graph"
else:
weights = tf.get_variable("weights", shape=None,
initializer=weights_initializer)
if bias_term:
biases = tf.get_variable("biases", shape=None,
initializer=biases_initializer)
print weights.name+" initialized from pre-trained parameters or retrieved from graph"
if bias_term:
print biases.name+" initialized from pre-trained parameters or retrieved from graph"
if bias_term:
fc = tf.nn.xw_plus_b(flat_bottom, weights, biases)
else:
fc = tf.matmul(flat_bottom, weights)
return fc
def fc_relu_layer(name, bottom, output_dim, bias_term=True,
weights_initializer=None, biases_initializer=None):
fc = fc_layer(name, bottom, output_dim, bias_term, weights_initializer,
biases_initializer)
relu = tf.nn.relu(fc)
return relu
def softmax_loss_layer(name, score_bottom, label_bottom):
"""
Calculates cumulative Softmax Cross Entropy Loss along the last dimension
*This function does not divide the loss by batch size*
Once tensorflow has SparseCrossEntropy function, this one will be replaced
"""
# Check shape
score_shape = score_bottom.get_shape().as_list()
label_shape = label_bottom.get_shape().as_list()
assert len(score_shape) == len(label_shape) + 1
assert score_shape[:-1] == label_shape
    # Compute the outer dimension (total number of labels) from the label shape
inner_dim = score_shape[-1]
outer_dim = 1
for d in label_shape: outer_dim *= d
# flatten score and label
flat_score = tf.reshape(score_bottom, [outer_dim, inner_dim])
flat_label = tf.reshape(label_bottom, [outer_dim, 1])
    # Completion of the originally unfinished body, following the docstring:
    # build dense one-hot labels of shape [outer_dim, inner_dim] and sum the
    # softmax cross-entropy over all labels (not divided by batch size).
    indices = tf.reshape(tf.range(outer_dim), [outer_dim, 1])
    concated = tf.concat(1, [indices, tf.cast(flat_label, tf.int32)])
    dense_labels = tf.sparse_to_dense(concated, [outer_dim, inner_dim], 1.0, 0.0)
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(flat_score, dense_labels)
    loss = tf.reduce_sum(cross_entropy, name=name)
    return loss
================================================
FILE: video_allframes_info.pkl
================================================
(dp0
S's30-d43'
p1
I19807
sS's30-d40'
p2
I4911
sS's30-d41'
p3
I19163
sS's22-d55'
p4
I1530
sS's25-d52'
p5
I5676
sS's25-d51'
p6
I3409
sS's28-d39'
p7
I22908
sS's35-d55'
p8
I6758
sS's24-d41'
p9
I17753
sS's24-d40'
p10
I6322
sS's24-d48'
p11
I2397
sS's21-d29'
p12
I1881
sS's21-d39'
p13
I3774
sS's22-d29'
p14
I3253
sS's33-d27'
p15
I20003
sS's22-d46'
p16
I14844
sS's32-d70'
p17
I7357
sS's13-d52'
p18
I2767
sS's22-d48'
p19
I2467
sS's13-d54'
p20
I8199
sS's37-d39'
p21
I9169
sS's25-d23'
p22
I3808
sS's27-d70'
p23
I8218
sS's36-d50'
p24
I5219
sS's26-d70'
p25
I5847
sS's22-d53'
p26
I6013
sS's21-d23'
p27
I3452
sS's21-d21'
p28
I4450
sS's24-d53'
p29
I5237
sS's21-d28'
p30
I3693
sS's32-d69'
p31
I12409
sS's35-d48'
p32
I7628
sS's31-d28'
p33
I5841
sS's31-d25'
p34
I2650
sS's22-d25'
p35
I1621
sS's29-d42'
p36
I22198
sS's35-d40'
p37
I13864
sS's23-d31'
p38
I3216
sS's17-d48'
p39
I2273
sS's36-d42'
p40
I18133
sS's34-d34'
p41
I17675
sS's35-d41'
p42
I27306
sS's34-d41'
p43
I15419
sS's37-d46'
p44
I13677
sS's21-d53'
p45
I4683
sS's30-d26'
p46
I17130
sS's24-d28'
p47
I7230
sS's21-d55'
p48
I2133
sS's14-d35'
p49
I2421
sS's30-d29'
p50
I6115
sS's36-d70'
p51
I7055
sS's23-d51'
p52
I6830
sS's31-d31'
p53
I6273
sS's23-d54'
p54
I9474
sS's24-d34'
p55
I5430
sS's17-d55'
p56
I2335
sS's27-d50'
p57
I2430
sS's17-d53'
p58
I7031
sS's22-d43'
p59
I3315
sS's25-d69'
p60
I14351
sS's27-d34'
p61
I1772
sS's23-d46'
p62
I10819
sS's21-d42'
p63
I6090
sS's21-d43'
p64
I4033
sS's23-d42'
p65
I12416
sS's21-d45'
p66
I2866
sS's14-d26'
p67
I12483
sS's14-d27'
p68
I6134
sS's21-d40'
p69
I3238
sS's26-d26'
p70
I41240
sS's26-d23'
p71
I2842
sS's17-d42'
p72
I15601
sS's34-d28'
p73
I11816
sS's23-d45'
p74
I4184
sS's29-d52'
p75
I5871
sS's15-d70'
p76
I8315
sS's27-d45'
p77
I4757
sS's29-d50'
p78
I2847
sS's27-d29'
p79
I8024
sS's21-d35'
p80
I1875
sS's27-d21'
p81
I4408
sS's37-d25'
p82
I1436
sS's37-d21'
p83
I11130
sS's23-d34'
p84
I5231
sS's23-d39'
p85
I5444
sS's37-d29'
p86
I3012
sS's27-d54'
p87
I10969
sS's14-d51'
p88
I8323
sS's28-d46'
p89
I11162
sS's23-d21'
p90
I4866
sS's13-d48'
p91
I3228
sS's32-d27'
p92
I22541
sS's13-d21'
p93
I2955
sS's13-d25'
p94
I2951
sS's13-d28'
p95
I5629
sS's14-d46'
p96
I9249
sS's14-d43'
p97
I4667
sS's22-d34'
p98
I2401
sS's21-d50'
p99
I1607
sS's24-d23'
p100
I5735
sS's29-d31'
p101
I3164
sS's17-d69'
p102
I12454
sS's15-d26'
p103
I33264
sS's29-d39'
p104
I9630
sS's32-d55'
p105
I5689
sS's32-d52'
p106
I5508
sS's13-d31'
p107
I5093
sS's36-d31'
p108
I7071
sS's36-d43'
p109
I26924
sS's33-d45'
p110
I7698
sS's22-d26'
p111
I9365
sS's28-d27'
p112
I17235
sS's28-d25'
p113
I3220
sS's30-d53'
p114
I13379
sS's30-d52'
p115
I7362
sS's33-d54'
p116
I10898
sS's22-d35'
p117
I2555
sS's15-d35'
p118
I6472
sS's33-d50'
p119
I3466
sS's13-d40'
p120
I3347
sS's13-d45'
p121
I4354
sS's36-d23'
p122
I19412
sS's25-d35'
p123
I3900
sS's28-d51'
p124
I19725
sS's26-d69'
p125
I20978
sS's36-d27'
p126
I4390
sS's34-d69'
p127
I12952
s.
================================================
FILE: vs_multilayer.py
================================================
from __future__ import division
import numpy as np
import tensorflow as tf
# components
from tensorflow.python.ops.nn import dropout as drop
from util.cnn import conv_layer as conv
from util.cnn import conv_relu_layer as conv_relu
from util.cnn import pooling_layer as pool
from util.cnn import fc_layer as fc
from util.cnn import fc_relu_layer as fc_relu
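# vs_multilayer is the prediction head applied to the cross-modal tensor built
# by CTRL_Model.cross_modal_comb: a 1x1 conv + ReLU with middle_layer_dim
# channels followed by a 1x1 conv producing 3 outputs per (sentence, clip)
# cell -- the alignment score and the two location-regression offsets. With
# reuse=True the variables of an existing scope with the same name are shared
# (used when building the test branch).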
def vs_multilayer(input_batch,name,middle_layer_dim=1000,reuse=False):
with tf.variable_scope(name):
if reuse==True:
print name+" reuse variables"
tf.get_variable_scope().reuse_variables()
else:
print name+" doesn't reuse variables"
layer1 = conv_relu('layer1', input_batch,
kernel_size=1,stride=1,output_dim=middle_layer_dim)
sim_score = conv('layer2', layer1,
kernel_size=1,stride=1,output_dim=3)
return sim_score