Repository: jiyanggao/TALL
Branch: master
Commit: 3df6794af148
Files: 10
Total size: 45.8 KB

Directory structure:
gitextract_wdpa8dzs/

├── README.md
├── ctrl_model.py
├── ctrl_test_results.txt
├── dataset.py
├── exp_data/
│   └── .gitkeep
├── main.py
├── util/
│   ├── __init__.py
│   └── cnn.py
├── video_allframes_info.pkl
└── vs_multilayer.py

================================================
FILE CONTENTS
================================================

================================================
FILE: README.md
================================================
## TALL: Temporal Activity Localization via Language Query

This is the repository for our ICCV 2017 paper [_TALL: Temporal Activity Localization via Language Query_](https://arxiv.org/abs/1705.02101).

### Visual Features on TACoS
Download the C3D features for [training set](https://drive.google.com/file/d/1zQp0aYGFCm8PqqHOh4UtXfy2U3pJMBeu/view?usp=sharing)  and [test set](https://drive.google.com/file/d/1zC-UrspRf42Qiu5prQw4fQrbgLQfJN-P/view?usp=sharing) of TACoS dataset. Modify the path to feature folders in main.py

### Sentence Embeddings on TACoS
Download the Skip-thought sentence embeddings and sample files from [here](https://drive.google.com/file/d/1HF-hNFPvLrHwI5O7YvYKZWTeTxC5Mg1K/view?usp=sharing) of TACoS Dataset, and put them under exp_data folder.

### Reproduce the results on TACoS
`python main.py`

### Charades-STA anno download
The sentence temporal annotations on [Charades](http://allenai.org/plato/charades/) dataset are available here: [train](https://drive.google.com/file/d/1ZjG7wJpPSMIBYnW7BAG2u9VVEoNvFm5c/view?usp=sharing), [test](https://drive.google.com/file/d/1QG4MXFkoj6JFU0YK5olTY75xTARKSW5e/view?usp=sharing). The format is "[video name] [start time] [end time]##[sentence]". You may want to generate the skip-thought embeddings and C3D features on Charades-STA, and modify the codes slightly to reproduce the experiments.

### Updates on Charades-STA performance
I did some anno cleaning for Charades-STA (compared to the version I used in ICCV paper), the updated performance is listed below. Please compare to these results when using Charades-STA.

| Model            | R@1,IoU=0.5 | R@1,IoU=0.7 | R@5,IoU=0.5 | R@5,IoU=0.7 |
| :--------------- | ----------: | ----------: | ----------: | ----------: | 
| CTRL (aln)       |   17.69     |    5.91     |    55.54    |     23.79   |
| CTRL (reg-p)     |   19.22     |    6.64     |    57.98    |     25.22   |
| CTRL (reg-np)    |   21.42     |    7.15     |    59.11    |     26.91   |


================================================
FILE: ctrl_model.py
================================================
import numpy as np
import tensorflow as tf
from tensorflow.python.framework import dtypes

from util.cnn import fc_layer as fc
import vs_multilayer 
from dataset import TestingDataSet
from dataset import TrainingDataSet


class CTRL_Model(object):
    def __init__(self, batch_size, train_csv_path, test_csv_path, test_visual_feature_dir, train_visual_feature_dir):
        
        self.batch_size = batch_size
        self.test_batch_size = 1
        self.vs_lr = 0.005
        self.lambda_regression = 0.01
        self.alpha = 1.0/batch_size
        self.semantic_size = 1024 # the size of visual and semantic comparison size
        self.sentence_embedding_size = 4800
        self.visual_feature_dim = 4096*3
        self.train_set=TrainingDataSet(train_visual_feature_dir, train_csv_path, self.batch_size)
        self.test_set=TestingDataSet(test_visual_feature_dir, test_csv_path, self.test_batch_size)
   
    '''
    used in training alignment model, CTRL(aln)
    '''	
    def fill_feed_dict_train(self):
        image_batch,sentence_batch,offset_batch = self.train_set.next_batch()
        input_feed = {
                self.visual_featmap_ph_train: image_batch,
                self.sentence_ph_train: sentence_batch,
                self.offset_ph: offset_batch
        }

        return input_feed
    
    '''
    used in training alignment+regression model, CTRL(reg)
    '''
    def fill_feed_dict_train_reg(self):
        image_batch, sentence_batch, offset_batch = self.train_set.next_batch_iou()
        input_feed = {
                self.visual_featmap_ph_train: image_batch,
                self.sentence_ph_train: sentence_batch,
                self.offset_ph: offset_batch
        }

        return input_feed

    
    '''
    cross modal processing module
    '''
    def cross_modal_comb(self, visual_feat, sentence_embed, batch_size):
        vv_feature = tf.reshape(tf.tile(visual_feat, [batch_size, 1]),
            [batch_size, batch_size, self.semantic_size])
        ss_feature = tf.reshape(tf.tile(sentence_embed,[1, batch_size]),[batch_size, batch_size, self.semantic_size])
        concat_feature = tf.reshape(tf.concat(2,[vv_feature, ss_feature]),[batch_size, batch_size, self.semantic_size+self.semantic_size])
        print concat_feature.get_shape().as_list()
        mul_feature = tf.mul(vv_feature, ss_feature) 
        add_feature = tf.add(vv_feature, ss_feature)
        
        comb_feature = tf.reshape(tf.concat(2, [mul_feature, add_feature, concat_feature]),[1, batch_size, batch_size, self.semantic_size*4])
        return comb_feature
    
    '''
    visual semantic inference, including visual semantic alignment and clip location regression
    '''
    def visual_semantic_infer(self, visual_feature_train, sentence_embed_train, visual_feature_test, sentence_embed_test):
        name="CTRL_Model"
        with tf.variable_scope(name):
            print "Building training network...............................\n"     
            transformed_clip_train = fc('v2s_lt', visual_feature_train, output_dim=self.semantic_size) 
            transformed_clip_train_norm = tf.nn.l2_normalize(transformed_clip_train, dim=1)
            transformed_sentence_train = fc('s2s_lt', sentence_embed_train, output_dim=self.semantic_size)
            transformed_sentence_train_norm = tf.nn.l2_normalize(transformed_sentence_train, dim=1)  
            cross_modal_vec_train = self.cross_modal_comb(transformed_clip_train_norm, transformed_sentence_train_norm, self.batch_size)
            sim_score_mat_train = vs_multilayer.vs_multilayer(cross_modal_vec_train, "vs_multilayer_lt", middle_layer_dim=1000)
            sim_score_mat_train = tf.reshape(sim_score_mat_train,[self.batch_size, self.batch_size, 3])

            tf.get_variable_scope().reuse_variables()
            print "Building test network...............................\n" 
            transformed_clip_test = fc('v2s_lt', visual_feature_test, output_dim=self.semantic_size)
            transformed_clip_test_norm = tf.nn.l2_normalize(transformed_clip_test, dim=1)
            transformed_sentence_test = fc('s2s_lt', sentence_embed_test, output_dim=self.semantic_size)
            transformed_sentence_test_norm = tf.nn.l2_normalize(transformed_sentence_test, dim=1)
            cross_modal_vec_test = self.cross_modal_comb(transformed_clip_test_norm, transformed_sentence_test_norm, self.test_batch_size)
            sim_score_mat_test = vs_multilayer.vs_multilayer(cross_modal_vec_test, "vs_multilayer_lt", reuse=True, middle_layer_dim=1000)
            sim_score_mat_test = tf.reshape(sim_score_mat_test, [3])

            return sim_score_mat_train, sim_score_mat_test

    '''
    compute alignment and regression loss
    '''
    def compute_loss_reg(self, sim_reg_mat, offset_label):

        sim_score_mat, p_reg_mat, l_reg_mat = tf.split(2, 3, sim_reg_mat)
        sim_score_mat = tf.reshape(sim_score_mat, [self.batch_size, self.batch_size])
        l_reg_mat = tf.reshape(l_reg_mat, [self.batch_size, self.batch_size])
        p_reg_mat = tf.reshape(p_reg_mat, [self.batch_size, self.batch_size])
        # unit matrix with -2
        I_2 = tf.diag(tf.constant(-2.0, shape=[self.batch_size]))
        all1 = tf.constant(1.0, shape=[self.batch_size, self.batch_size])
        #               | -1  1   1...   |

        #   mask_mat =  | 1  -1  -1...   |

        #               | 1   1  -1 ...  |
        mask_mat = tf.add(I_2, all1)
        # loss cls, not considering iou
        I = tf.diag(tf.constant(1.0, shape=[self.batch_size]))
        I_half = tf.diag(tf.constant(0.5, shape=[self.batch_size]))
        batch_para_mat = tf.constant(self.alpha, shape=[self.batch_size, self.batch_size])
        para_mat = tf.add(I,batch_para_mat)
        loss_mat = tf.log(tf.add(all1, tf.exp(tf.mul(mask_mat, sim_score_mat))))
        loss_mat = tf.mul(loss_mat, para_mat)
        loss_align = tf.reduce_mean(loss_mat)
        # regression loss
        l_reg_diag = tf.matmul(tf.mul(l_reg_mat, I), tf.constant(1.0, shape=[self.batch_size, 1]))
        p_reg_diag = tf.matmul(tf.mul(p_reg_mat, I), tf.constant(1.0, shape=[self.batch_size, 1]))
        offset_pred = tf.concat(1, (p_reg_diag, l_reg_diag))
        loss_reg = tf.reduce_mean(tf.abs(tf.sub(offset_pred, offset_label)))

        loss=tf.add(tf.mul(self.lambda_regression, loss_reg), loss_align)
        return loss, offset_pred, loss_reg


    def init_placeholder(self):
        visual_featmap_ph_train = tf.placeholder(tf.float32, shape=(self.batch_size, self.visual_feature_dim))
        sentence_ph_train = tf.placeholder(tf.float32, shape=(self.batch_size, self.sentence_embedding_size))
        offset_ph = tf.placeholder(tf.float32, shape=(self.batch_size,2))
        visual_featmap_ph_test = tf.placeholder(tf.float32, shape=(self.test_batch_size, self.visual_feature_dim))
        sentence_ph_test = tf.placeholder(tf.float32, shape=(self.test_batch_size, self.sentence_embedding_size))

        return visual_featmap_ph_train,sentence_ph_train,offset_ph,visual_featmap_ph_test,sentence_ph_test
    

    def get_variables_by_name(self,name_list):
        v_list = tf.trainable_variables()
        v_dict = {}
        for name in name_list:
            v_dict[name] = []
        for v in v_list:
            for name in name_list:
                if name in v.name: v_dict[name].append(v)

        for name in name_list:
            print "Variables of <"+name+">"
            for v in v_dict[name]:
                print "    "+v.name
        return v_dict

    def training(self, loss):
        
        v_dict = self.get_variables_by_name(["lt"])
        vs_optimizer = tf.train.AdamOptimizer(self.vs_lr, name='vs_adam')
        vs_train_op = vs_optimizer.minimize(loss, var_list=v_dict["lt"])
        return vs_train_op


    def construct_model(self):
        # initialize the placeholder
        self.visual_featmap_ph_train, self.sentence_ph_train, self.offset_ph, self.visual_featmap_ph_test, self.sentence_ph_test=self.init_placeholder()
        # build inference network
        sim_reg_mat, sim_reg_mat_test = self.visual_semantic_infer(self.visual_featmap_ph_train, self.sentence_ph_train, self.visual_featmap_ph_test, self.sentence_ph_test)
        # compute loss
        self.loss_align_reg, offset_pred, loss_reg = self.compute_loss_reg(sim_reg_mat, self.offset_ph)
        # optimize
        self.vs_train_op = self.training(self.loss_align_reg)
        return self.loss_align_reg, self.vs_train_op, sim_reg_mat_test, offset_pred, loss_reg


================================================
FILE: ctrl_test_results.txt
================================================


================================================
FILE: dataset.py
================================================

import numpy as np
from math import sqrt
import os
import random
import pickle

'''
calculate temporal intersection over union
'''
def calculate_IoU(i0, i1):
    union = (min(i0[0], i1[0]), max(i0[1], i1[1]))
    inter = (max(i0[0], i1[0]), min(i0[1], i1[1]))
    iou = 1.0*(inter[1]-inter[0])/(union[1]-union[0])
    return iou

'''
calculate the non Intersection part over Length ratia, make sure the input IoU is larger than 0
'''
def calculate_nIoL(base, sliding_clip):
    inter = (max(base[0], sliding_clip[0]), min(base[1], sliding_clip[1]))
    inter_l = inter[1]-inter[0]
    length = sliding_clip[1]-sliding_clip[0]
    nIoL = 1.0*(length-inter_l)/length
    return nIoL

class TrainingDataSet(object):
    def __init__(self, sliding_dir, it_path, batch_size):
        
        self.counter = 0
        self.batch_size = batch_size
        self.context_num = 1
        self.context_size = 128
        print "Reading training data list from "+it_path
        cs = pickle.load(open(it_path))
        movie_length_info = pickle.load(open("./video_allframes_info.pkl"))
        self.clip_sentence_pairs = []
        for l in cs:
            clip_name = l[0]
            sent_vecs = l[1]
            for sent_vec in sent_vecs:
                self.clip_sentence_pairs.append((clip_name, sent_vec))

        movie_names_set = set()
        self.movie_clip_names = {}
        # read groundtruth sentence-clip pairs
        for k in range(len(self.clip_sentence_pairs)):
            clip_name = self.clip_sentence_pairs[k][0]
            movie_name = clip_name.split("_")[0]
            if not movie_name in movie_names_set:
                movie_names_set.add(movie_name)
                self.movie_clip_names[movie_name] = []
            self.movie_clip_names[movie_name].append(k)
        self.movie_names = list(movie_names_set)
        self.visual_feature_dim = 4096*3
        self.sent_vec_dim = 4800
        self.num_samples = len(self.clip_sentence_pairs)
        self.sliding_clip_path = sliding_dir
        print str(len(self.clip_sentence_pairs))+" clip-sentence pairs are readed"
        
        # read sliding windows, and match them with the groundtruths to make training samples
        sliding_clips_tmp = os.listdir(self.sliding_clip_path)
        self.clip_sentence_pairs_iou = []
        for clip_name in sliding_clips_tmp:
            if clip_name.split(".")[2]=="npy":
                movie_name = clip_name.split("_")[0]
                for clip_sentence in self.clip_sentence_pairs:
                    original_clip_name = clip_sentence[0] 
                    original_movie_name = original_clip_name.split("_")[0]
                    if original_movie_name==movie_name:
                        start = int(clip_name.split("_")[1])
                        end = int(clip_name.split("_")[2].split(".")[0])
                        o_start = int(original_clip_name.split("_")[1]) 
                        o_end = int(original_clip_name.split("_")[2].split(".")[0])
                        iou = calculate_IoU((start, end), (o_start, o_end))
                        if iou>0.5:
                            nIoL=calculate_nIoL((o_start, o_end), (start, end))
                            if nIoL<0.15:
                                movie_length = movie_length_info[movie_name.split(".")[0]]
                                start_offset =o_start-start
                                end_offset = o_end-end
                                self.clip_sentence_pairs_iou.append((clip_sentence[0], clip_sentence[1], clip_name, start_offset, end_offset))
        self.num_samples_iou = len(self.clip_sentence_pairs_iou)
        print str(len(self.clip_sentence_pairs_iou))+" iou clip-sentence pairs are readed"
       
    
    '''
    compute left (pre) and right (post) context features
    '''
    def get_context_window(self, clip_name, win_length):
        movie_name = clip_name.split("_")[0]
        start = int(clip_name.split("_")[1])
        end = int(clip_name.split("_")[2].split(".")[0])
        clip_length = self.context_size
        left_context_feats = np.zeros([win_length, 4096], dtype=np.float32)
        right_context_feats = np.zeros([win_length, 4096], dtype=np.float32)
        last_left_feat = np.load(self.sliding_clip_path+clip_name)
        last_right_feat = np.load(self.sliding_clip_path+clip_name)
        for k in range(win_length):
            left_context_start = start-clip_length*(k+1)
            left_context_end = start-clip_length*k
            right_context_start = end+clip_length*k
            right_context_end = end+clip_length*(k+1)
            left_context_name = movie_name+"_"+str(left_context_start)+"_"+str(left_context_end)+".npy"
            right_context_name = movie_name+"_"+str(right_context_start)+"_"+str(right_context_end)+".npy"
            if os.path.exists(self.sliding_clip_path+left_context_name):
                left_context_feat = np.load(self.sliding_clip_path+left_context_name)
                last_left_feat = left_context_feat
            else:
                left_context_feat = last_left_feat
            if os.path.exists(self.sliding_clip_path+right_context_name):
                right_context_feat = np.load(self.sliding_clip_path+right_context_name)
                last_right_feat = right_context_feat
            else:
                right_context_feat = last_right_feat
            left_context_feats[k] = left_context_feat
            right_context_feats[k] = right_context_feat
        return np.mean(left_context_feats, axis=0), np.mean(right_context_feats, axis=0)
    
    '''
    read next batch of training data, this function is used for training CTRL-aln
    '''
    def next_batch(self):
        
        random_batch_index = random.sample(range(self.num_samples), self.batch_size)
        image_batch = np.zeros([self.batch_size, self.visual_feature_dim])
        sentence_batch = np.zeros([self.batch_size, self.sent_vec_dim])
        offset_batch = np.zeros([self.batch_size, 2], dtype=np.float32) # this one is actually useless
        index = 0
        clip_set=set()
        while index < self.batch_size:
            k = random_batch_index[index]
            clip_name = self.clip_sentence_pairs[k][0]
            if not clip_name in clip_set: 
                clip_set.add(clip_name)
                feat_path = self.image_dir+self.clip_sentence_pairs[k][0]+".npy"
                featmap = np.load(feat_path)
                image_batch[index,:] = featmap
                sentence_batch[index,:] = self.clip_sentence_pairs[k][1][:self.sent_vec_dim]

                index+=1
            else:
                r = random.choice(range(self.num_samples))
                random_batch_index[index] = r
                continue 
                      
        return image_batch, sentence_batch, offset_batch

    '''
    read next batch of training data, this function is used for training CTRL-reg
    '''
    def next_batch_iou(self):

        random_batch_index = random.sample(range(self.num_samples_iou), self.batch_size)
        image_batch = np.zeros([self.batch_size, self.visual_feature_dim])
        sentence_batch = np.zeros([self.batch_size, self.sent_vec_dim])
        offset_batch = np.zeros([self.batch_size, 2], dtype=np.float32)
        index = 0
        clip_set = set()
        while index < self.batch_size:
            k = random_batch_index[index]
            clip_name = self.clip_sentence_pairs_iou[k][0]
            if not clip_name in clip_set:
                clip_set.add(clip_name)
                feat_path = self.sliding_clip_path+self.clip_sentence_pairs_iou[k][2]
                featmap = np.load(feat_path)
                # read context features
                left_context_feat, right_context_feat = self.get_context_window(self.clip_sentence_pairs_iou[k][2], self.context_num)
                image_batch[index,:] = np.hstack((left_context_feat, featmap, right_context_feat))
                sentence_batch[index,:] = self.clip_sentence_pairs_iou[k][1][:self.sent_vec_dim]
                p_offset = self.clip_sentence_pairs_iou[k][3]
                l_offset = self.clip_sentence_pairs_iou[k][4]
                offset_batch[index,0] = p_offset
                offset_batch[index,1] = l_offset
                index+=1
            else:
                r = random.choice(range(self.num_samples_iou))
                random_batch_index[index] = r
                continue
       
        return image_batch, sentence_batch, offset_batch


class TestingDataSet(object):
    def __init__(self, img_dir, csv_path, batch_size):
        #il_path: image_label_file path
        #self.index_in_epoch = 0
        #self.epochs_completed = 0
        self.batch_size = batch_size
        self.image_dir = img_dir
        print "Reading testing data list from "+csv_path
        self.semantic_size = 4800
        csv = pickle.load(open(csv_path))
        self.clip_sentence_pairs = []
        for l in csv:
            clip_name = l[0]
            sent_vecs = l[1]
            for sent_vec in sent_vecs:
                self.clip_sentence_pairs.append((clip_name, sent_vec))
        print str(len(self.clip_sentence_pairs))+" pairs are readed"
        movie_names_set = set()
        self.movie_clip_names = {}
        for k in range(len(self.clip_sentence_pairs)):
            clip_name = self.clip_sentence_pairs[k][0]
            movie_name = clip_name.split("_")[0]
            if not movie_name in movie_names_set:
                movie_names_set.add(movie_name)
                self.movie_clip_names[movie_name] = []
            self.movie_clip_names[movie_name].append(k)
        self.movie_names = list(movie_names_set)
        
        self.clip_num_per_movie_max = 0
        for movie_name in self.movie_clip_names:
            if len(self.movie_clip_names[movie_name])>self.clip_num_per_movie_max: self.clip_num_per_movie_max = len(self.movie_clip_names[movie_name])
        print "Max number of clips in a movie is "+str(self.clip_num_per_movie_max)
        
        self.sliding_clip_path = img_dir
        sliding_clips_tmp = os.listdir(self.sliding_clip_path)
        self.sliding_clip_names = []
        for clip_name in sliding_clips_tmp:
            if clip_name.split(".")[2]=="npy":
                movie_name = clip_name.split("_")[0]
                if movie_name in self.movie_clip_names:
                    self.sliding_clip_names.append(clip_name.split(".")[0]+"."+clip_name.split(".")[1])
        self.num_samples = len(self.clip_sentence_pairs)
        print "sliding clips number: "+str(len(self.sliding_clip_names))
        assert self.batch_size <= self.num_samples
        

    def get_clip_sample(self, sample_num, movie_name, clip_name):
        length=len(os.listdir(self.image_dir+movie_name+"/"+clip_name))
        sample_step=1.0*length/sample_num
        sample_pos=np.floor(sample_step*np.array(range(sample_num)))
        sample_pos_str=[]
        img_names=os.listdir(self.image_dir+movie_name+"/"+clip_name)
        # sort is very important! to get a correct sequence order
        img_names.sort()
       # print img_names
        for pos in sample_pos:
            sample_pos_str.append(self.image_dir+movie_name+"/"+clip_name+"/"+img_names[int(pos)])
        return sample_pos_str
    
    def get_context_window(self, clip_name, win_length):
        movie_name = clip_name.split("_")[0]
        start = int(clip_name.split("_")[1])
        end = int(clip_name.split("_")[2].split(".")[0])
        clip_length = 128#end-start
        left_context_feats = np.zeros([win_length,4096], dtype=np.float32)
        right_context_feats = np.zeros([win_length,4096], dtype=np.float32)
        last_left_feat = np.load(self.sliding_clip_path+clip_name)
        last_right_feat = np.load(self.sliding_clip_path+clip_name)
        for k in range(win_length):
            left_context_start = start-clip_length*(k+1)
            left_context_end = start-clip_length*k
            right_context_start = end+clip_length*k
            right_context_end = end+clip_length*(k+1)
            left_context_name = movie_name+"_"+str(left_context_start)+"_"+str(left_context_end)+".npy"
            right_context_name = movie_name+"_"+str(right_context_start)+"_"+str(right_context_end)+".npy"
            if os.path.exists(self.sliding_clip_path+left_context_name):
                left_context_feat = np.load(self.sliding_clip_path+left_context_name)
                last_left_feat = left_context_feat
            else:
                left_context_feat = last_left_feat
            if os.path.exists(self.sliding_clip_path+right_context_name):
                right_context_feat = np.load(self.sliding_clip_path+right_context_name)
                last_right_feat = right_context_feat
            else:
                right_context_feat = last_right_feat
            left_context_feats[k] = left_context_feat
            right_context_feats[k] = right_context_feat

        return np.mean(left_context_feats, axis=0), np.mean(right_context_feats, axis=0)


    def load_movie(self, movie_name):
        movie_clip_sentences=[]
        for k in range(len(self.clip_names)):
            if movie_name in self.clip_names[k]:
                movie_clip_sentences.append((self.clip_names[k], self.sent_vecs[k][:2400], self.sentences[k]))

        movie_clip_imgs=[]
        for k in range(len(self.movie_frames[movie_name])):
           # print str(k)+"/"+str(len(self.movie_frames[movie_name]))            
            if os.path.isfile(self.movie_frames[movie_name][k][1]) and os.path.getsize(self.movie_frames[movie_name][k][1])!=0:
                img=load_image(self.movie_frames[movie_name][k][1])
                movie_clip_imgs.append((self.movie_frames[movie_name][k][0],img))
                    
        return movie_clip_imgs, movie_clip_sentences

    def load_movie_byclip(self,movie_name,sample_num):
        movie_clip_sentences=[]
        movie_clip_featmap=[]
        clip_set=set()
        for k in range(len(self.clip_sentence_pairs)):
            if movie_name in self.clip_sentence_pairs[k][0]:
                movie_clip_sentences.append((self.clip_sentence_pairs[k][0],self.clip_sentence_pairs[k][1][:self.semantic_size]))

                if not self.clip_sentence_pairs[k][0] in clip_set:
                    clip_set.add(self.clip_sentence_pairs[k][0])
                    # print str(k)+"/"+str(len(self.movie_clip_names[movie_name]))
                    visual_feature_path=self.image_dir+self.clip_sentence_pairs[k][0]+".npy"
                    feature_data=np.load(visual_feature_path)
                    movie_clip_featmap.append((self.clip_sentence_pairs[k][0],feature_data))
        return movie_clip_featmap, movie_clip_sentences
    
    def load_movie_slidingclip(self, movie_name, sample_num):
        movie_clip_sentences = []
        movie_clip_featmap = []
        clip_set = set()
        for k in range(len(self.clip_sentence_pairs)):
            if movie_name in self.clip_sentence_pairs[k][0]:
                movie_clip_sentences.append((self.clip_sentence_pairs[k][0], self.clip_sentence_pairs[k][1][:self.semantic_size]))
        for k in range(len(self.sliding_clip_names)):
            if movie_name in self.sliding_clip_names[k]:
                # print str(k)+"/"+str(len(self.movie_clip_names[movie_name]))
                visual_feature_path = self.sliding_clip_path+self.sliding_clip_names[k]+".npy"
                #context_feat=self.get_context(self.sliding_clip_names[k]+".npy")
                left_context_feat,right_context_feat = self.get_context_window(self.sliding_clip_names[k]+".npy",1)
                feature_data = np.load(visual_feature_path)
                #comb_feat=np.hstack((context_feat,feature_data))
                comb_feat = np.hstack((left_context_feat,feature_data,right_context_feat))
                movie_clip_featmap.append((self.sliding_clip_names[k], comb_feat))
        return movie_clip_featmap, movie_clip_sentences


================================================
FILE: exp_data/.gitkeep
================================================


================================================
FILE: main.py
================================================
import tensorflow as tf
import numpy as np
import ctrl_model
from six.moves import xrange
import time
from sklearn.metrics import average_precision_score
import pickle
import vs_multilayer
import operator

def dense_to_one_hot(labels_dense, num_classes):
    """Convert class labels from scalars to one-hot vectors."""
    num_labels = labels_dense.shape[0]
    index_offset = np.arange(num_labels) * num_classes
    labels_one_hot = np.zeros((num_labels, num_classes))
    labels_one_hot.flat[index_offset+labels_dense.ravel()] = 1
    return labels_one_hot

def compute_ap(class_score_matrix, labels):
    num_classes=class_score_matrix.shape[1]
    one_hot_labels=dense_to_one_hot(labels, num_classes)
    predictions=np.array(class_score_matrix>0, dtype="int32")
    average_precision=[]
    for i in range(num_classes):
        ps=average_precision_score(one_hot_labels[:, i], class_score_matrix[:, i])
       # if not np.isnan(ps):
        average_precision.append(ps)
    return np.array(average_precision)

def calculate_IoU(i0,i1):
    union = (min(i0[0], i1[0]), max(i0[1], i1[1]))
    inter = (max(i0[0], i1[0]), min(i0[1], i1[1]))
    iou = 1.0*(inter[1]-inter[0])/(union[1]-union[0])
    return iou

def nms_temporal(x1,x2,s, overlap):
    pick = []
    assert len(x1)==len(s)
    assert len(x2)==len(s)
    if len(x1)==0:
        return pick

    #x1 = [b[0] for b in boxes]
    #x2 = [b[1] for b in boxes]
    #s = [b[-1] for b in boxes]
    union = map(operator.sub, x2, x1) # union = x2-x1
    I = [i[0] for i in sorted(enumerate(s), key=lambda x:x[1])] # sort and get index

    while len(I)>0:
        i = I[-1]
        pick.append(i)

        xx1 = [max(x1[i],x1[j]) for j in I[:-1]]
        xx2 = [min(x2[i],x2[j]) for j in I[:-1]]
        inter = [max(0.0, k2-k1) for k1, k2 in zip(xx1, xx2)]
        o = [inter[u]/(union[i] + union[I[u]] - inter[u]) for u in range(len(I)-1)]
        I_new = []
        for j in range(len(o)):
            if o[j] <=overlap:
                I_new.append(I[j])
        I = I_new
    return pick

'''
compute recall at certain IoU
'''
def compute_IoU_recall_top_n_forreg(top_n, iou_thresh, sentence_image_mat, sentence_image_reg_mat, sclips, iclips):
    correct_num = 0.0
    for k in range(sentence_image_mat.shape[0]):
        gt = sclips[k]
        gt_start = float(gt.split("_")[1])
        gt_end = float(gt.split("_")[2])
        #print gt +" "+str(gt_start)+" "+str(gt_end)
        sim_v = [v for v in sentence_image_mat[k]]
        starts = [s for s in sentence_image_reg_mat[k,:,0]]
        ends = [e for e in sentence_image_reg_mat[k,:,1]]
        picks = nms_temporal(starts,ends, sim_v, iou_thresh-0.05)
        #sim_argsort=np.argsort(sim_v)[::-1][0:top_n]
        if top_n<len(picks): picks=picks[0:top_n]
        for index in picks:
            pred_start = sentence_image_reg_mat[k, index, 0]
            pred_end = sentence_image_reg_mat[k, index, 1]
            iou = calculate_IoU((gt_start, gt_end),(pred_start, pred_end))
            if iou>=iou_thresh:
                correct_num+=1
                break
    return correct_num

'''
evaluate the model
'''
def do_eval_slidingclips(sess, vs_eval_op, model, movie_length_info, iter_step, test_result_output):
    IoU_thresh = [0.1, 0.2, 0.3, 0.4, 0.5]
    all_correct_num_10 = [0.0]*5
    all_correct_num_5 = [0.0]*5
    all_correct_num_1 = [0.0]*5
    all_retrievd = 0.0
    for movie_name in model.test_set.movie_names:
        movie_length=movie_length_info[movie_name.split(".")[0]]
        print "Test movie: "+movie_name+"....loading movie data"
        movie_clip_featmaps, movie_clip_sentences=model.test_set.load_movie_slidingclip(movie_name, 16)
        print "sentences: "+ str(len(movie_clip_sentences))
        print "clips: "+ str(len(movie_clip_featmaps))
        sentence_image_mat=np.zeros([len(movie_clip_sentences), len(movie_clip_featmaps)])
        sentence_image_reg_mat=np.zeros([len(movie_clip_sentences), len(movie_clip_featmaps), 2])
        for k in range(len(movie_clip_sentences)):
            #sentence_clip_name=movie_clip_sentences[k][0]
            #start=float(sentence_clip_name.split("_")[1])
            #end=float(sentence_clip_name.split("_")[2].split("_")[0])
            
            sent_vec=movie_clip_sentences[k][1]
            sent_vec=np.reshape(sent_vec,[1,sent_vec.shape[0]])
            for t in range(len(movie_clip_featmaps)):
                featmap = movie_clip_featmaps[t][1]
                visual_clip_name = movie_clip_featmaps[t][0]
                start = float(visual_clip_name.split("_")[1])
                end = float(visual_clip_name.split("_")[2].split("_")[0])
                featmap = np.reshape(featmap, [1, featmap.shape[0]])
                feed_dict = {
                model.visual_featmap_ph_test: featmap,
                model.sentence_ph_test:sent_vec
                }
                outputs = sess.run(vs_eval_op,feed_dict=feed_dict)
                sentence_image_mat[k,t] = outputs[0]
                reg_clip_length = (end-start)*(10**outputs[2])
                reg_mid_point = (start+end)/2.0+movie_length*outputs[1]
                reg_end = end+outputs[2]
                reg_start = start+outputs[1]
                
                sentence_image_reg_mat[k,t,0] = reg_start
                sentence_image_reg_mat[k,t,1] = reg_end
        
        iclips = [b[0] for b in movie_clip_featmaps]
        sclips = [b[0] for b in movie_clip_sentences]
        
        # calculate Recall@m, IoU=n
        for k in range(len(IoU_thresh)):
            IoU=IoU_thresh[k]
            correct_num_10 = compute_IoU_recall_top_n_forreg(10, IoU, sentence_image_mat, sentence_image_reg_mat, sclips, iclips)
            correct_num_5 = compute_IoU_recall_top_n_forreg(5, IoU, sentence_image_mat, sentence_image_reg_mat, sclips, iclips)
            correct_num_1 = compute_IoU_recall_top_n_forreg(1, IoU, sentence_image_mat, sentence_image_reg_mat, sclips, iclips)
            print movie_name+" IoU="+str(IoU)+", R@10: "+str(correct_num_10/len(sclips))+"; IoU="+str(IoU)+", R@5: "+str(correct_num_5/len(sclips))+"; IoU="+str(IoU)+", R@1: "+str(correct_num_1/len(sclips))
            all_correct_num_10[k]+=correct_num_10
            all_correct_num_5[k]+=correct_num_5
            all_correct_num_1[k]+=correct_num_1
        all_retrievd+=len(sclips)
    for k in range(len(IoU_thresh)):
        print " IoU="+str(IoU_thresh[k])+", R@10: "+str(all_correct_num_10[k]/all_retrievd)+"; IoU="+str(IoU_thresh[k])+", R@5: "+str(all_correct_num_5[k]/all_retrievd)+"; IoU="+str(IoU_thresh[k])+", R@1: "+str(all_correct_num_1[k]/all_retrievd)
        test_result_output.write("Step "+str(iter_step)+": IoU="+str(IoU_thresh[k])+", R@10: "+str(all_correct_num_10[k]/all_retrievd)+"; IoU="+str(IoU_thresh[k])+", R@5: "+str(all_correct_num_5[k]/all_retrievd)+"; IoU="+str(IoU_thresh[k])+", R@1: "+str(all_correct_num_1[k]/all_retrievd)+"\n")

def run_training():
    initial_steps = 0
    max_steps = 20000
    batch_size = 56
    train_csv_path = "./exp_data/TACoS/train_clip-sentvec.pkl"
    test_csv_path = "./exp_data/TACoS/test_clip-sentvec.pkl"
    test_feature_dir="../TACOS/Interval128_256_overlap0.8_c3d_fc6/"
    train_feature_dir = "../TACOS/Interval64_128_256_512_overlap0.8_c3d_fc6/"
    
    model = ctrl_model.CTRL_Model(batch_size, train_csv_path, test_csv_path, test_feature_dir, train_feature_dir)
    test_result_output=open("ctrl_test_results.txt", "w")
    with tf.Graph().as_default():
		
        loss_align_reg, vs_train_op, vs_eval_op, offset_pred, loss_reg = model.construct_model()
        # Create a session for running Ops on the Graph.
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction = 0.2)
        sess = tf.Session(config = tf.ConfigProto(gpu_options = gpu_options))
        # Run the Op to initialize the variables.
        init = tf.initialize_all_variables()
        sess.run(init)
        for step in xrange(max_steps):
            start_time = time.time()
            feed_dict = model.fill_feed_dict_train_reg()
            _, loss_value, offset_pred_v, loss_reg_v = sess.run([vs_train_op, loss_align_reg, offset_pred, loss_reg], feed_dict=feed_dict)
            duration = time.time() - start_time

            if step % 5 == 0:
                # Print status to stdout.
                print('Step %d: loss = %.3f (%.3f sec)' % (step, loss_value, duration))

            if (step+1) % 2000 == 0:
                print "Start to test:-----------------\n"
                movie_length_info=pickle.load(open("./video_allframes_info.pkl"))
                do_eval_slidingclips(sess, vs_eval_op, model, movie_length_info, step+1, test_result_output)

def main(_):
    run_training()


if __name__ == '__main__':
    tf.app.run()
        	

================================================
FILE: util/__init__.py
================================================


================================================
FILE: util/cnn.py
================================================
from __future__ import division  #, print_function

import tensorflow as tf

def conv_layer(name, bottom, kernel_size, stride, output_dim, padding='SAME',
               bias_term=True, weights_initializer=None, biases_initializer=None):
    # input has shape [batch, in_height, in_width, in_channels]
    input_dim = bottom.get_shape().as_list()[-1]

    # weights and biases variables
    with tf.variable_scope(name):
        if weights_initializer is None and biases_initializer is None:
            # initialize the variables
            if weights_initializer is None:
                weights_initializer = tf.random_normal_initializer(mean=0.0, stddev=0.01)
            if bias_term and biases_initializer is None:
              biases_initializer = tf.constant_initializer(0.)
            print "input_dim"+str(input_dim)
            # filter has shape [filter_height, filter_width, in_channels, out_channels]
            weights = tf.get_variable("weights",
                [kernel_size, kernel_size, input_dim, output_dim],
                initializer=weights_initializer)
            if bias_term:
                biases = tf.get_variable("biases", output_dim,
                    initializer=biases_initializer)
            print str(weights.name)+" initialized as random or retrieved from graph"
            if bias_term:
                print biases.name+" initialized as random or retrieved from graph"

        else:
            weights = tf.get_variable("weights",
                shape=None,
                initializer=weights_initializer)
            if bias_term:
                biases = tf.get_variable("biases", shape=None,
                    initializer=biases_initializer) 

            print weights.name+" initialized from pre-trained parameters or retrieved from graph"
            if bias_term:
                print biases.name+" initialized from pre-trained parameters or retrieved from graph"


    conv = tf.nn.conv2d(bottom, filter=weights,
        strides=[1, stride, stride, 1], padding=padding)
    if bias_term:
        conv = tf.nn.bias_add(conv, biases)
    return conv

def conv_relu_layer(name, bottom, kernel_size, stride, output_dim, padding='SAME',
                    bias_term=True, weights_initializer=None, biases_initializer=None):
    conv = conv_layer(name, bottom, kernel_size, stride, output_dim, padding,
                      bias_term, weights_initializer, biases_initializer)
    relu = tf.nn.relu(conv)
    return relu

def deconv_layer(name, bottom, kernel_size, stride, output_dim, padding='SAME',
                 bias_term=True, weights_initializer=None, biases_initializer=None):
    # input_shape is [batch, in_height, in_width, in_channels]
    input_shape = bottom.get_shape().as_list()
    batch_size, input_height, input_width, input_dim = input_shape
    output_shape = [batch_size, input_height*stride, input_width*stride, output_dim]

    # weights and biases variables
    with tf.variable_scope(name):
        # initialize the variables
        if weights_initializer is None:
            weights_initializer = tf.random_normal_initializer()
        if bias_term and biases_initializer is None:
            biases_initializer = tf.constant_initializer(0.)

        # filter has shape [filter_height, filter_width, out_channels, in_channels]
        weights = tf.get_variable("weights",
            [kernel_size, kernel_size, output_dim, input_dim],
            initializer=weights_initializer)
        if bias_term:
            biases = tf.get_variable("biases", output_dim,
                initializer=biases_initializer)

    deconv = tf.nn.conv2d_transpose(bottom, filter=weights,
        output_shape=output_shape, strides=[1, stride, stride, 1],
        padding=padding)
    if bias_term:
        deconv = tf.nn.bias_add(deconv, biases)
    return deconv

def deconv_relu_layer(name, bottom, kernel_size, stride, output_dim, padding='SAME',
                      bias_term=True, weights_initializer=None, biases_initializer=None):
    deconv = deconv_layer(name, bottom, kernel_size, stride, output_dim, padding,
                          bias_term, weights_initializer, biases_initializer)
    relu = tf.nn.relu(deconv)
    return relu

def pooling_layer(name, bottom, kernel_size, stride):
    pool = tf.nn.max_pool(bottom, ksize=[1, kernel_size, kernel_size, 1],
        strides=[1, stride, stride, 1], padding='SAME', name=name)
    return pool

def fc_layer(name, bottom, output_dim, bias_term=True, weights_initializer=None,
             biases_initializer=None):
    # flatten bottom input
    # input has shape [batch, in_height, in_width, in_channels]
    shape = bottom.get_shape().as_list()
    input_dim = 1
    for d in shape[1:]:
        input_dim *= d
    flat_bottom = tf.reshape(bottom, [-1, input_dim])
    
    # weights and biases variables
    with tf.variable_scope(name):
        if weights_initializer is None and biases_initializer is None:
            # initialize the variables
            if weights_initializer is None:
                weights_initializer = tf.random_normal_initializer()
            if bias_term and biases_initializer is None:
                biases_initializer = tf.constant_initializer(0.)

            # weights has shape [input_dim, output_dim]
            weights = tf.get_variable("weights", [input_dim, output_dim],
                initializer=weights_initializer)
            if bias_term:
                biases = tf.get_variable("biases", output_dim,
                    initializer=biases_initializer)

            print weights.name+" initialized as random or retrieved from graph"
            if bias_term:
                print biases.name+" initialized as random or retrieved from graph"
        else:
            weights = tf.get_variable("weights", shape=None,
                initializer=weights_initializer)
            if bias_term:
                biases = tf.get_variable("biases", shape=None,
                    initializer=biases_initializer)

            print weights.name+" initialized from pre-trained parameters or retrieved from graph"
            if bias_term:
                print biases.name+" initialized from pre-trained parameters or retrieved from graph"

    if bias_term:
        fc = tf.nn.xw_plus_b(flat_bottom, weights, biases)
    else:
        fc = tf.matmul(flat_bottom, weights)
    return fc

def fc_relu_layer(name, bottom, output_dim, bias_term=True,
                  weights_initializer=None, biases_initializer=None):
    fc = fc_layer(name, bottom, output_dim, bias_term, weights_initializer,
                  biases_initializer)
    relu = tf.nn.relu(fc)
    return relu

def softmax_loss_layer(name, score_bottom, label_bottom):
    """
    Calculates cumulative Softmax Cross Entropy Loss along the last dimension
    *This function does not divide the loss by batch size*

    Once tensorflow has SparseCrossEntropy function, this one will be replaced
    """
    # Check shape
    score_shape = score_bottom.get_shape().as_list()
    label_shape = label_bottom.get_shape().as_list()
    assert len(score_shape) == len(label_shape) + 1
    assert score_shape[:-1] == label_shape

    # Compute the outer dimensions dimensions in label
    inner_dim = score_shape[-1]
    outer_dim = 1
    for d in label_shape: outer_dim *= d

    # flatten score and label
    flat_score = tf.reshape(score_bottom, [outer_dim, inner_dim])
    flat_label = tf.reshape(label_bottom, [outer_dim, 1])

    # Reshape the labels into a dense Tensor of
    # shape [batch_size, NUM_CLASSES].
    sparse_labels = tf.reshape(labels, [FLAGS.batch_size, 1])
    indices = tf.reshape(tf.range(FLAGS.batch_size), [FLAGS.batch_size, 1])
    concated = tf.concat(1, [indices, sparse_labels])
    dense_labels = tf.sparse_to_dense(concated, [FLAGS.batch_size, NUM_CLASSES],
        1.0, 0.0)


================================================
FILE: video_allframes_info.pkl
================================================
(dp0
S's30-d43'
p1
I19807
sS's30-d40'
p2
I4911
sS's30-d41'
p3
I19163
sS's22-d55'
p4
I1530
sS's25-d52'
p5
I5676
sS's25-d51'
p6
I3409
sS's28-d39'
p7
I22908
sS's35-d55'
p8
I6758
sS's24-d41'
p9
I17753
sS's24-d40'
p10
I6322
sS's24-d48'
p11
I2397
sS's21-d29'
p12
I1881
sS's21-d39'
p13
I3774
sS's22-d29'
p14
I3253
sS's33-d27'
p15
I20003
sS's22-d46'
p16
I14844
sS's32-d70'
p17
I7357
sS's13-d52'
p18
I2767
sS's22-d48'
p19
I2467
sS's13-d54'
p20
I8199
sS's37-d39'
p21
I9169
sS's25-d23'
p22
I3808
sS's27-d70'
p23
I8218
sS's36-d50'
p24
I5219
sS's26-d70'
p25
I5847
sS's22-d53'
p26
I6013
sS's21-d23'
p27
I3452
sS's21-d21'
p28
I4450
sS's24-d53'
p29
I5237
sS's21-d28'
p30
I3693
sS's32-d69'
p31
I12409
sS's35-d48'
p32
I7628
sS's31-d28'
p33
I5841
sS's31-d25'
p34
I2650
sS's22-d25'
p35
I1621
sS's29-d42'
p36
I22198
sS's35-d40'
p37
I13864
sS's23-d31'
p38
I3216
sS's17-d48'
p39
I2273
sS's36-d42'
p40
I18133
sS's34-d34'
p41
I17675
sS's35-d41'
p42
I27306
sS's34-d41'
p43
I15419
sS's37-d46'
p44
I13677
sS's21-d53'
p45
I4683
sS's30-d26'
p46
I17130
sS's24-d28'
p47
I7230
sS's21-d55'
p48
I2133
sS's14-d35'
p49
I2421
sS's30-d29'
p50
I6115
sS's36-d70'
p51
I7055
sS's23-d51'
p52
I6830
sS's31-d31'
p53
I6273
sS's23-d54'
p54
I9474
sS's24-d34'
p55
I5430
sS's17-d55'
p56
I2335
sS's27-d50'
p57
I2430
sS's17-d53'
p58
I7031
sS's22-d43'
p59
I3315
sS's25-d69'
p60
I14351
sS's27-d34'
p61
I1772
sS's23-d46'
p62
I10819
sS's21-d42'
p63
I6090
sS's21-d43'
p64
I4033
sS's23-d42'
p65
I12416
sS's21-d45'
p66
I2866
sS's14-d26'
p67
I12483
sS's14-d27'
p68
I6134
sS's21-d40'
p69
I3238
sS's26-d26'
p70
I41240
sS's26-d23'
p71
I2842
sS's17-d42'
p72
I15601
sS's34-d28'
p73
I11816
sS's23-d45'
p74
I4184
sS's29-d52'
p75
I5871
sS's15-d70'
p76
I8315
sS's27-d45'
p77
I4757
sS's29-d50'
p78
I2847
sS's27-d29'
p79
I8024
sS's21-d35'
p80
I1875
sS's27-d21'
p81
I4408
sS's37-d25'
p82
I1436
sS's37-d21'
p83
I11130
sS's23-d34'
p84
I5231
sS's23-d39'
p85
I5444
sS's37-d29'
p86
I3012
sS's27-d54'
p87
I10969
sS's14-d51'
p88
I8323
sS's28-d46'
p89
I11162
sS's23-d21'
p90
I4866
sS's13-d48'
p91
I3228
sS's32-d27'
p92
I22541
sS's13-d21'
p93
I2955
sS's13-d25'
p94
I2951
sS's13-d28'
p95
I5629
sS's14-d46'
p96
I9249
sS's14-d43'
p97
I4667
sS's22-d34'
p98
I2401
sS's21-d50'
p99
I1607
sS's24-d23'
p100
I5735
sS's29-d31'
p101
I3164
sS's17-d69'
p102
I12454
sS's15-d26'
p103
I33264
sS's29-d39'
p104
I9630
sS's32-d55'
p105
I5689
sS's32-d52'
p106
I5508
sS's13-d31'
p107
I5093
sS's36-d31'
p108
I7071
sS's36-d43'
p109
I26924
sS's33-d45'
p110
I7698
sS's22-d26'
p111
I9365
sS's28-d27'
p112
I17235
sS's28-d25'
p113
I3220
sS's30-d53'
p114
I13379
sS's30-d52'
p115
I7362
sS's33-d54'
p116
I10898
sS's22-d35'
p117
I2555
sS's15-d35'
p118
I6472
sS's33-d50'
p119
I3466
sS's13-d40'
p120
I3347
sS's13-d45'
p121
I4354
sS's36-d23'
p122
I19412
sS's25-d35'
p123
I3900
sS's28-d51'
p124
I19725
sS's26-d69'
p125
I20978
sS's36-d27'
p126
I4390
sS's34-d69'
p127
I12952
s.

================================================
FILE: vs_multilayer.py
================================================
from __future__ import division

import numpy as np
import tensorflow as tf

# components
from tensorflow.python.ops.nn import dropout as drop
from util.cnn import conv_layer as conv
from util.cnn import conv_relu_layer as conv_relu
from util.cnn import pooling_layer as pool
from util.cnn import fc_layer as fc
from util.cnn import fc_relu_layer as fc_relu

def vs_multilayer(input_batch,name,middle_layer_dim=1000,reuse=False):
    with tf.variable_scope(name):
        if reuse==True:
            print name+" reuse variables"
            tf.get_variable_scope().reuse_variables()
        else:
            print name+" doesn't reuse variables"

        layer1 = conv_relu('layer1', input_batch,
                        kernel_size=1,stride=1,output_dim=middle_layer_dim)
        sim_score = conv('layer2', layer1,
                        kernel_size=1,stride=1,output_dim=3)
    return sim_score