[
  {
    "path": "README.md",
    "content": "## TALL: Temporal Activity Localization via Language Query\n\nThis is the repository for our ICCV 2017 paper [_TALL: Temporal Activity Localization via Language Query_](https://arxiv.org/abs/1705.02101).\n\n### Visual Features on TACoS\nDownload the C3D features for [training set](https://drive.google.com/file/d/1zQp0aYGFCm8PqqHOh4UtXfy2U3pJMBeu/view?usp=sharing)  and [test set](https://drive.google.com/file/d/1zC-UrspRf42Qiu5prQw4fQrbgLQfJN-P/view?usp=sharing) of TACoS dataset. Modify the path to feature folders in main.py\n\n### Sentence Embeddings on TACoS\nDownload the Skip-thought sentence embeddings and sample files from [here](https://drive.google.com/file/d/1HF-hNFPvLrHwI5O7YvYKZWTeTxC5Mg1K/view?usp=sharing) of TACoS Dataset, and put them under exp_data folder.\n\n### Reproduce the results on TACoS\n`python main.py`\n\n### Charades-STA anno download\nThe sentence temporal annotations on [Charades](http://allenai.org/plato/charades/) dataset are available here: [train](https://drive.google.com/file/d/1ZjG7wJpPSMIBYnW7BAG2u9VVEoNvFm5c/view?usp=sharing), [test](https://drive.google.com/file/d/1QG4MXFkoj6JFU0YK5olTY75xTARKSW5e/view?usp=sharing). The format is \"[video name] [start time] [end time]##[sentence]\". You may want to generate the skip-thought embeddings and C3D features on Charades-STA, and modify the codes slightly to reproduce the experiments.\n\n### Updates on Charades-STA performance\nI did some anno cleaning for Charades-STA (compared to the version I used in ICCV paper), the updated performance is listed below. Please compare to these results when using Charades-STA.\n\n| Model            | R@1,IoU=0.5 | R@1,IoU=0.7 | R@5,IoU=0.5 | R@5,IoU=0.7 |\n| :--------------- | ----------: | ----------: | ----------: | ----------: | \n| CTRL (aln)       |   17.69     |    5.91     |    55.54    |     23.79   |\n| CTRL (reg-p)     |   19.22     |    6.64     |    57.98    |     25.22   |\n| CTRL (reg-np)    |   21.42     |    7.15     |    59.11    |     26.91   |\n"
  },
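  {
    "path": "charades_sta_parse_example.py",
    "content": "'''\nMinimal illustrative sketch (not part of the original ICCV code): parse the\nCharades-STA annotation format described in README.md, i.e. lines of\n\"[video name] [start time] [end time]##[sentence]\". The file name and the\noutput structure used here are assumptions; adapt them to your own pipeline\nbefore generating Skip-thought embeddings and C3D features.\n'''\n\n\ndef load_charades_sta_annotations(anno_path):\n    # returns a list of (video_name, start_time, end_time, sentence) tuples\n    annotations = []\n    with open(anno_path) as f:\n        for line in f:\n            line = line.strip()\n            if not line:\n                continue\n            # left of \"##\" is \"video_name start_time end_time\", right of it is the sentence\n            clip_info, sentence = line.split(\"##\", 1)\n            video_name, start, end = clip_info.split()\n            annotations.append((video_name, float(start), float(end), sentence))\n    return annotations\n\n\nif __name__ == '__main__':\n    # example usage; the annotation file path below is an assumption\n    annos = load_charades_sta_annotations(\"./exp_data/charades_sta_train.txt\")\n    print str(len(annos)) + \" annotations are read\"\n    print annos[0]\n"
  },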
  {
    "path": "ctrl_model.py",
    "content": "import numpy as np\nimport tensorflow as tf\nfrom tensorflow.python.framework import dtypes\n\nfrom util.cnn import fc_layer as fc\nimport vs_multilayer \nfrom dataset import TestingDataSet\nfrom dataset import TrainingDataSet\n\n\nclass CTRL_Model(object):\n    def __init__(self, batch_size, train_csv_path, test_csv_path, test_visual_feature_dir, train_visual_feature_dir):\n        \n        self.batch_size = batch_size\n        self.test_batch_size = 1\n        self.vs_lr = 0.005\n        self.lambda_regression = 0.01\n        self.alpha = 1.0/batch_size\n        self.semantic_size = 1024 # the size of visual and semantic comparison size\n        self.sentence_embedding_size = 4800\n        self.visual_feature_dim = 4096*3\n        self.train_set=TrainingDataSet(train_visual_feature_dir, train_csv_path, self.batch_size)\n        self.test_set=TestingDataSet(test_visual_feature_dir, test_csv_path, self.test_batch_size)\n   \n    '''\n    used in training alignment model, CTRL(aln)\n    '''\t\n    def fill_feed_dict_train(self):\n        image_batch,sentence_batch,offset_batch = self.train_set.next_batch()\n        input_feed = {\n                self.visual_featmap_ph_train: image_batch,\n                self.sentence_ph_train: sentence_batch,\n                self.offset_ph: offset_batch\n        }\n\n        return input_feed\n    \n    '''\n    used in training alignment+regression model, CTRL(reg)\n    '''\n    def fill_feed_dict_train_reg(self):\n        image_batch, sentence_batch, offset_batch = self.train_set.next_batch_iou()\n        input_feed = {\n                self.visual_featmap_ph_train: image_batch,\n                self.sentence_ph_train: sentence_batch,\n                self.offset_ph: offset_batch\n        }\n\n        return input_feed\n\n    \n    '''\n    cross modal processing module\n    '''\n    def cross_modal_comb(self, visual_feat, sentence_embed, batch_size):\n        vv_feature = tf.reshape(tf.tile(visual_feat, [batch_size, 1]),\n            [batch_size, batch_size, self.semantic_size])\n        ss_feature = tf.reshape(tf.tile(sentence_embed,[1, batch_size]),[batch_size, batch_size, self.semantic_size])\n        concat_feature = tf.reshape(tf.concat(2,[vv_feature, ss_feature]),[batch_size, batch_size, self.semantic_size+self.semantic_size])\n        print concat_feature.get_shape().as_list()\n        mul_feature = tf.mul(vv_feature, ss_feature) \n        add_feature = tf.add(vv_feature, ss_feature)\n        \n        comb_feature = tf.reshape(tf.concat(2, [mul_feature, add_feature, concat_feature]),[1, batch_size, batch_size, self.semantic_size*4])\n        return comb_feature\n    \n    '''\n    visual semantic inference, including visual semantic alignment and clip location regression\n    '''\n    def visual_semantic_infer(self, visual_feature_train, sentence_embed_train, visual_feature_test, sentence_embed_test):\n        name=\"CTRL_Model\"\n        with tf.variable_scope(name):\n            print \"Building training network...............................\\n\"     \n            transformed_clip_train = fc('v2s_lt', visual_feature_train, output_dim=self.semantic_size) \n            transformed_clip_train_norm = tf.nn.l2_normalize(transformed_clip_train, dim=1)\n            transformed_sentence_train = fc('s2s_lt', sentence_embed_train, output_dim=self.semantic_size)\n            transformed_sentence_train_norm = tf.nn.l2_normalize(transformed_sentence_train, dim=1)  \n            cross_modal_vec_train = 
self.cross_modal_comb(transformed_clip_train_norm, transformed_sentence_train_norm, self.batch_size)\n            sim_score_mat_train = vs_multilayer.vs_multilayer(cross_modal_vec_train, \"vs_multilayer_lt\", middle_layer_dim=1000)\n            sim_score_mat_train = tf.reshape(sim_score_mat_train,[self.batch_size, self.batch_size, 3])\n\n            tf.get_variable_scope().reuse_variables()\n            print \"Building test network...............................\\n\" \n            transformed_clip_test = fc('v2s_lt', visual_feature_test, output_dim=self.semantic_size)\n            transformed_clip_test_norm = tf.nn.l2_normalize(transformed_clip_test, dim=1)\n            transformed_sentence_test = fc('s2s_lt', sentence_embed_test, output_dim=self.semantic_size)\n            transformed_sentence_test_norm = tf.nn.l2_normalize(transformed_sentence_test, dim=1)\n            cross_modal_vec_test = self.cross_modal_comb(transformed_clip_test_norm, transformed_sentence_test_norm, self.test_batch_size)\n            sim_score_mat_test = vs_multilayer.vs_multilayer(cross_modal_vec_test, \"vs_multilayer_lt\", reuse=True, middle_layer_dim=1000)\n            sim_score_mat_test = tf.reshape(sim_score_mat_test, [3])\n\n            return sim_score_mat_train, sim_score_mat_test\n\n    '''\n    compute alignment and regression loss\n    '''\n    def compute_loss_reg(self, sim_reg_mat, offset_label):\n\n        sim_score_mat, p_reg_mat, l_reg_mat = tf.split(2, 3, sim_reg_mat)\n        sim_score_mat = tf.reshape(sim_score_mat, [self.batch_size, self.batch_size])\n        l_reg_mat = tf.reshape(l_reg_mat, [self.batch_size, self.batch_size])\n        p_reg_mat = tf.reshape(p_reg_mat, [self.batch_size, self.batch_size])\n        # unit matrix with -2\n        I_2 = tf.diag(tf.constant(-2.0, shape=[self.batch_size]))\n        all1 = tf.constant(1.0, shape=[self.batch_size, self.batch_size])\n        #               | -1  1   1...   |\n\n        #   mask_mat =  | 1  -1  -1...   |\n\n        #               | 1   1  -1 ...  
|\n        mask_mat = tf.add(I_2, all1)\n        # loss cls, not considering iou\n        I = tf.diag(tf.constant(1.0, shape=[self.batch_size]))\n        I_half = tf.diag(tf.constant(0.5, shape=[self.batch_size]))\n        batch_para_mat = tf.constant(self.alpha, shape=[self.batch_size, self.batch_size])\n        para_mat = tf.add(I,batch_para_mat)\n        loss_mat = tf.log(tf.add(all1, tf.exp(tf.mul(mask_mat, sim_score_mat))))\n        loss_mat = tf.mul(loss_mat, para_mat)\n        loss_align = tf.reduce_mean(loss_mat)\n        # regression loss\n        l_reg_diag = tf.matmul(tf.mul(l_reg_mat, I), tf.constant(1.0, shape=[self.batch_size, 1]))\n        p_reg_diag = tf.matmul(tf.mul(p_reg_mat, I), tf.constant(1.0, shape=[self.batch_size, 1]))\n        offset_pred = tf.concat(1, (p_reg_diag, l_reg_diag))\n        loss_reg = tf.reduce_mean(tf.abs(tf.sub(offset_pred, offset_label)))\n\n        loss=tf.add(tf.mul(self.lambda_regression, loss_reg), loss_align)\n        return loss, offset_pred, loss_reg\n\n\n    def init_placeholder(self):\n        visual_featmap_ph_train = tf.placeholder(tf.float32, shape=(self.batch_size, self.visual_feature_dim))\n        sentence_ph_train = tf.placeholder(tf.float32, shape=(self.batch_size, self.sentence_embedding_size))\n        offset_ph = tf.placeholder(tf.float32, shape=(self.batch_size,2))\n        visual_featmap_ph_test = tf.placeholder(tf.float32, shape=(self.test_batch_size, self.visual_feature_dim))\n        sentence_ph_test = tf.placeholder(tf.float32, shape=(self.test_batch_size, self.sentence_embedding_size))\n\n        return visual_featmap_ph_train,sentence_ph_train,offset_ph,visual_featmap_ph_test,sentence_ph_test\n    \n\n    def get_variables_by_name(self,name_list):\n        v_list = tf.trainable_variables()\n        v_dict = {}\n        for name in name_list:\n            v_dict[name] = []\n        for v in v_list:\n            for name in name_list:\n                if name in v.name: v_dict[name].append(v)\n\n        for name in name_list:\n            print \"Variables of <\"+name+\">\"\n            for v in v_dict[name]:\n                print \"    \"+v.name\n        return v_dict\n\n    def training(self, loss):\n        \n        v_dict = self.get_variables_by_name([\"lt\"])\n        vs_optimizer = tf.train.AdamOptimizer(self.vs_lr, name='vs_adam')\n        vs_train_op = vs_optimizer.minimize(loss, var_list=v_dict[\"lt\"])\n        return vs_train_op\n\n\n    def construct_model(self):\n        # initialize the placeholder\n        self.visual_featmap_ph_train, self.sentence_ph_train, self.offset_ph, self.visual_featmap_ph_test, self.sentence_ph_test=self.init_placeholder()\n        # build inference network\n        sim_reg_mat, sim_reg_mat_test = self.visual_semantic_infer(self.visual_featmap_ph_train, self.sentence_ph_train, self.visual_featmap_ph_test, self.sentence_ph_test)\n        # compute loss\n        self.loss_align_reg, offset_pred, loss_reg = self.compute_loss_reg(sim_reg_mat, self.offset_ph)\n        # optimize\n        self.vs_train_op = self.training(self.loss_align_reg)\n        return self.loss_align_reg, self.vs_train_op, sim_reg_mat_test, offset_pred, loss_reg\n\n\n"
  },
  {
    "path": "ctrl_test_results.txt",
    "content": ""
  },
  {
    "path": "dataset.py",
    "content": "\nimport numpy as np\nfrom math import sqrt\nimport os\nimport random\nimport pickle\n\n'''\ncalculate temporal intersection over union\n'''\ndef calculate_IoU(i0, i1):\n    union = (min(i0[0], i1[0]), max(i0[1], i1[1]))\n    inter = (max(i0[0], i1[0]), min(i0[1], i1[1]))\n    iou = 1.0*(inter[1]-inter[0])/(union[1]-union[0])\n    return iou\n\n'''\ncalculate the non Intersection part over Length ratia, make sure the input IoU is larger than 0\n'''\ndef calculate_nIoL(base, sliding_clip):\n    inter = (max(base[0], sliding_clip[0]), min(base[1], sliding_clip[1]))\n    inter_l = inter[1]-inter[0]\n    length = sliding_clip[1]-sliding_clip[0]\n    nIoL = 1.0*(length-inter_l)/length\n    return nIoL\n\nclass TrainingDataSet(object):\n    def __init__(self, sliding_dir, it_path, batch_size):\n        \n        self.counter = 0\n        self.batch_size = batch_size\n        self.context_num = 1\n        self.context_size = 128\n        print \"Reading training data list from \"+it_path\n        cs = pickle.load(open(it_path))\n        movie_length_info = pickle.load(open(\"./video_allframes_info.pkl\"))\n        self.clip_sentence_pairs = []\n        for l in cs:\n            clip_name = l[0]\n            sent_vecs = l[1]\n            for sent_vec in sent_vecs:\n                self.clip_sentence_pairs.append((clip_name, sent_vec))\n\n        movie_names_set = set()\n        self.movie_clip_names = {}\n        # read groundtruth sentence-clip pairs\n        for k in range(len(self.clip_sentence_pairs)):\n            clip_name = self.clip_sentence_pairs[k][0]\n            movie_name = clip_name.split(\"_\")[0]\n            if not movie_name in movie_names_set:\n                movie_names_set.add(movie_name)\n                self.movie_clip_names[movie_name] = []\n            self.movie_clip_names[movie_name].append(k)\n        self.movie_names = list(movie_names_set)\n        self.visual_feature_dim = 4096*3\n        self.sent_vec_dim = 4800\n        self.num_samples = len(self.clip_sentence_pairs)\n        self.sliding_clip_path = sliding_dir\n        print str(len(self.clip_sentence_pairs))+\" clip-sentence pairs are readed\"\n        \n        # read sliding windows, and match them with the groundtruths to make training samples\n        sliding_clips_tmp = os.listdir(self.sliding_clip_path)\n        self.clip_sentence_pairs_iou = []\n        for clip_name in sliding_clips_tmp:\n            if clip_name.split(\".\")[2]==\"npy\":\n                movie_name = clip_name.split(\"_\")[0]\n                for clip_sentence in self.clip_sentence_pairs:\n                    original_clip_name = clip_sentence[0] \n                    original_movie_name = original_clip_name.split(\"_\")[0]\n                    if original_movie_name==movie_name:\n                        start = int(clip_name.split(\"_\")[1])\n                        end = int(clip_name.split(\"_\")[2].split(\".\")[0])\n                        o_start = int(original_clip_name.split(\"_\")[1]) \n                        o_end = int(original_clip_name.split(\"_\")[2].split(\".\")[0])\n                        iou = calculate_IoU((start, end), (o_start, o_end))\n                        if iou>0.5:\n                            nIoL=calculate_nIoL((o_start, o_end), (start, end))\n                            if nIoL<0.15:\n                                movie_length = movie_length_info[movie_name.split(\".\")[0]]\n                                start_offset =o_start-start\n                                end_offset = 
o_end-end\n                                self.clip_sentence_pairs_iou.append((clip_sentence[0], clip_sentence[1], clip_name, start_offset, end_offset))\n        self.num_samples_iou = len(self.clip_sentence_pairs_iou)\n        print str(len(self.clip_sentence_pairs_iou))+\" iou clip-sentence pairs are readed\"\n       \n    \n    '''\n    compute left (pre) and right (post) context features\n    '''\n    def get_context_window(self, clip_name, win_length):\n        movie_name = clip_name.split(\"_\")[0]\n        start = int(clip_name.split(\"_\")[1])\n        end = int(clip_name.split(\"_\")[2].split(\".\")[0])\n        clip_length = self.context_size\n        left_context_feats = np.zeros([win_length, 4096], dtype=np.float32)\n        right_context_feats = np.zeros([win_length, 4096], dtype=np.float32)\n        last_left_feat = np.load(self.sliding_clip_path+clip_name)\n        last_right_feat = np.load(self.sliding_clip_path+clip_name)\n        for k in range(win_length):\n            left_context_start = start-clip_length*(k+1)\n            left_context_end = start-clip_length*k\n            right_context_start = end+clip_length*k\n            right_context_end = end+clip_length*(k+1)\n            left_context_name = movie_name+\"_\"+str(left_context_start)+\"_\"+str(left_context_end)+\".npy\"\n            right_context_name = movie_name+\"_\"+str(right_context_start)+\"_\"+str(right_context_end)+\".npy\"\n            if os.path.exists(self.sliding_clip_path+left_context_name):\n                left_context_feat = np.load(self.sliding_clip_path+left_context_name)\n                last_left_feat = left_context_feat\n            else:\n                left_context_feat = last_left_feat\n            if os.path.exists(self.sliding_clip_path+right_context_name):\n                right_context_feat = np.load(self.sliding_clip_path+right_context_name)\n                last_right_feat = right_context_feat\n            else:\n                right_context_feat = last_right_feat\n            left_context_feats[k] = left_context_feat\n            right_context_feats[k] = right_context_feat\n        return np.mean(left_context_feats, axis=0), np.mean(right_context_feats, axis=0)\n    \n    '''\n    read next batch of training data, this function is used for training CTRL-aln\n    '''\n    def next_batch(self):\n        \n        random_batch_index = random.sample(range(self.num_samples), self.batch_size)\n        image_batch = np.zeros([self.batch_size, self.visual_feature_dim])\n        sentence_batch = np.zeros([self.batch_size, self.sent_vec_dim])\n        offset_batch = np.zeros([self.batch_size, 2], dtype=np.float32) # this one is actually useless\n        index = 0\n        clip_set=set()\n        while index < self.batch_size:\n            k = random_batch_index[index]\n            clip_name = self.clip_sentence_pairs[k][0]\n            if not clip_name in clip_set: \n                clip_set.add(clip_name)\n                feat_path = self.image_dir+self.clip_sentence_pairs[k][0]+\".npy\"\n                featmap = np.load(feat_path)\n                image_batch[index,:] = featmap\n                sentence_batch[index,:] = self.clip_sentence_pairs[k][1][:self.sent_vec_dim]\n\n                index+=1\n            else:\n                r = random.choice(range(self.num_samples))\n                random_batch_index[index] = r\n                continue \n                      \n        return image_batch, sentence_batch, offset_batch\n\n    '''\n    read next batch of training data, this 
function is used for training CTRL-reg\n    '''\n    def next_batch_iou(self):\n\n        random_batch_index = random.sample(range(self.num_samples_iou), self.batch_size)\n        image_batch = np.zeros([self.batch_size, self.visual_feature_dim])\n        sentence_batch = np.zeros([self.batch_size, self.sent_vec_dim])\n        offset_batch = np.zeros([self.batch_size, 2], dtype=np.float32)\n        index = 0\n        clip_set = set()\n        while index < self.batch_size:\n            k = random_batch_index[index]\n            clip_name = self.clip_sentence_pairs_iou[k][0]\n            if not clip_name in clip_set:\n                clip_set.add(clip_name)\n                feat_path = self.sliding_clip_path+self.clip_sentence_pairs_iou[k][2]\n                featmap = np.load(feat_path)\n                # read context features\n                left_context_feat, right_context_feat = self.get_context_window(self.clip_sentence_pairs_iou[k][2], self.context_num)\n                image_batch[index,:] = np.hstack((left_context_feat, featmap, right_context_feat))\n                sentence_batch[index,:] = self.clip_sentence_pairs_iou[k][1][:self.sent_vec_dim]\n                p_offset = self.clip_sentence_pairs_iou[k][3]\n                l_offset = self.clip_sentence_pairs_iou[k][4]\n                offset_batch[index,0] = p_offset\n                offset_batch[index,1] = l_offset\n                index+=1\n            else:\n                r = random.choice(range(self.num_samples_iou))\n                random_batch_index[index] = r\n                continue\n       \n        return image_batch, sentence_batch, offset_batch\n\n\nclass TestingDataSet(object):\n    def __init__(self, img_dir, csv_path, batch_size):\n        #il_path: image_label_file path\n        #self.index_in_epoch = 0\n        #self.epochs_completed = 0\n        self.batch_size = batch_size\n        self.image_dir = img_dir\n        print \"Reading testing data list from \"+csv_path\n        self.semantic_size = 4800\n        csv = pickle.load(open(csv_path))\n        self.clip_sentence_pairs = []\n        for l in csv:\n            clip_name = l[0]\n            sent_vecs = l[1]\n            for sent_vec in sent_vecs:\n                self.clip_sentence_pairs.append((clip_name, sent_vec))\n        print str(len(self.clip_sentence_pairs))+\" pairs are readed\"\n        movie_names_set = set()\n        self.movie_clip_names = {}\n        for k in range(len(self.clip_sentence_pairs)):\n            clip_name = self.clip_sentence_pairs[k][0]\n            movie_name = clip_name.split(\"_\")[0]\n            if not movie_name in movie_names_set:\n                movie_names_set.add(movie_name)\n                self.movie_clip_names[movie_name] = []\n            self.movie_clip_names[movie_name].append(k)\n        self.movie_names = list(movie_names_set)\n        \n        self.clip_num_per_movie_max = 0\n        for movie_name in self.movie_clip_names:\n            if len(self.movie_clip_names[movie_name])>self.clip_num_per_movie_max: self.clip_num_per_movie_max = len(self.movie_clip_names[movie_name])\n        print \"Max number of clips in a movie is \"+str(self.clip_num_per_movie_max)\n        \n        self.sliding_clip_path = img_dir\n        sliding_clips_tmp = os.listdir(self.sliding_clip_path)\n        self.sliding_clip_names = []\n        for clip_name in sliding_clips_tmp:\n            if clip_name.split(\".\")[2]==\"npy\":\n                movie_name = clip_name.split(\"_\")[0]\n                if movie_name in 
self.movie_clip_names:\n                    self.sliding_clip_names.append(clip_name.split(\".\")[0]+\".\"+clip_name.split(\".\")[1])\n        self.num_samples = len(self.clip_sentence_pairs)\n        print \"sliding clips number: \"+str(len(self.sliding_clip_names))\n        assert self.batch_size <= self.num_samples\n        \n\n    def get_clip_sample(self, sample_num, movie_name, clip_name):\n        length=len(os.listdir(self.image_dir+movie_name+\"/\"+clip_name))\n        sample_step=1.0*length/sample_num\n        sample_pos=np.floor(sample_step*np.array(range(sample_num)))\n        sample_pos_str=[]\n        img_names=os.listdir(self.image_dir+movie_name+\"/\"+clip_name)\n        # sort is very important! to get a correct sequence order\n        img_names.sort()\n       # print img_names\n        for pos in sample_pos:\n            sample_pos_str.append(self.image_dir+movie_name+\"/\"+clip_name+\"/\"+img_names[int(pos)])\n        return sample_pos_str\n    \n    def get_context_window(self, clip_name, win_length):\n        movie_name = clip_name.split(\"_\")[0]\n        start = int(clip_name.split(\"_\")[1])\n        end = int(clip_name.split(\"_\")[2].split(\".\")[0])\n        clip_length = 128#end-start\n        left_context_feats = np.zeros([win_length,4096], dtype=np.float32)\n        right_context_feats = np.zeros([win_length,4096], dtype=np.float32)\n        last_left_feat = np.load(self.sliding_clip_path+clip_name)\n        last_right_feat = np.load(self.sliding_clip_path+clip_name)\n        for k in range(win_length):\n            left_context_start = start-clip_length*(k+1)\n            left_context_end = start-clip_length*k\n            right_context_start = end+clip_length*k\n            right_context_end = end+clip_length*(k+1)\n            left_context_name = movie_name+\"_\"+str(left_context_start)+\"_\"+str(left_context_end)+\".npy\"\n            right_context_name = movie_name+\"_\"+str(right_context_start)+\"_\"+str(right_context_end)+\".npy\"\n            if os.path.exists(self.sliding_clip_path+left_context_name):\n                left_context_feat = np.load(self.sliding_clip_path+left_context_name)\n                last_left_feat = left_context_feat\n            else:\n                left_context_feat = last_left_feat\n            if os.path.exists(self.sliding_clip_path+right_context_name):\n                right_context_feat = np.load(self.sliding_clip_path+right_context_name)\n                last_right_feat = right_context_feat\n            else:\n                right_context_feat = last_right_feat\n            left_context_feats[k] = left_context_feat\n            right_context_feats[k] = right_context_feat\n\n        return np.mean(left_context_feats, axis=0), np.mean(right_context_feats, axis=0)\n\n\n    def load_movie(self, movie_name):\n        movie_clip_sentences=[]\n        for k in range(len(self.clip_names)):\n            if movie_name in self.clip_names[k]:\n                movie_clip_sentences.append((self.clip_names[k], self.sent_vecs[k][:2400], self.sentences[k]))\n\n        movie_clip_imgs=[]\n        for k in range(len(self.movie_frames[movie_name])):\n           # print str(k)+\"/\"+str(len(self.movie_frames[movie_name]))            \n            if os.path.isfile(self.movie_frames[movie_name][k][1]) and os.path.getsize(self.movie_frames[movie_name][k][1])!=0:\n                img=load_image(self.movie_frames[movie_name][k][1])\n                movie_clip_imgs.append((self.movie_frames[movie_name][k][0],img))\n                    \n        
return movie_clip_imgs, movie_clip_sentences\n\n    def load_movie_byclip(self,movie_name,sample_num):\n        movie_clip_sentences=[]\n        movie_clip_featmap=[]\n        clip_set=set()\n        for k in range(len(self.clip_sentence_pairs)):\n            if movie_name in self.clip_sentence_pairs[k][0]:\n                movie_clip_sentences.append((self.clip_sentence_pairs[k][0],self.clip_sentence_pairs[k][1][:self.semantic_size]))\n\n                if not self.clip_sentence_pairs[k][0] in clip_set:\n                    clip_set.add(self.clip_sentence_pairs[k][0])\n                    # print str(k)+\"/\"+str(len(self.movie_clip_names[movie_name]))\n                    visual_feature_path=self.image_dir+self.clip_sentence_pairs[k][0]+\".npy\"\n                    feature_data=np.load(visual_feature_path)\n                    movie_clip_featmap.append((self.clip_sentence_pairs[k][0],feature_data))\n        return movie_clip_featmap, movie_clip_sentences\n    \n    def load_movie_slidingclip(self, movie_name, sample_num):\n        movie_clip_sentences = []\n        movie_clip_featmap = []\n        clip_set = set()\n        for k in range(len(self.clip_sentence_pairs)):\n            if movie_name in self.clip_sentence_pairs[k][0]:\n                movie_clip_sentences.append((self.clip_sentence_pairs[k][0], self.clip_sentence_pairs[k][1][:self.semantic_size]))\n        for k in range(len(self.sliding_clip_names)):\n            if movie_name in self.sliding_clip_names[k]:\n                # print str(k)+\"/\"+str(len(self.movie_clip_names[movie_name]))\n                visual_feature_path = self.sliding_clip_path+self.sliding_clip_names[k]+\".npy\"\n                #context_feat=self.get_context(self.sliding_clip_names[k]+\".npy\")\n                left_context_feat,right_context_feat = self.get_context_window(self.sliding_clip_names[k]+\".npy\",1)\n                feature_data = np.load(visual_feature_path)\n                #comb_feat=np.hstack((context_feat,feature_data))\n                comb_feat = np.hstack((left_context_feat,feature_data,right_context_feat))\n                movie_clip_featmap.append((self.sliding_clip_names[k], comb_feat))\n        return movie_clip_featmap, movie_clip_sentences\n\n\n"
  },
  {
    "path": "exp_data/.gitkeep",
    "content": ""
  },
  {
    "path": "main.py",
    "content": "import tensorflow as tf\nimport numpy as np\nimport ctrl_model\nfrom six.moves import xrange\nimport time\nfrom sklearn.metrics import average_precision_score\nimport pickle\nimport vs_multilayer\nimport operator\n\ndef dense_to_one_hot(labels_dense, num_classes):\n    \"\"\"Convert class labels from scalars to one-hot vectors.\"\"\"\n    num_labels = labels_dense.shape[0]\n    index_offset = np.arange(num_labels) * num_classes\n    labels_one_hot = np.zeros((num_labels, num_classes))\n    labels_one_hot.flat[index_offset+labels_dense.ravel()] = 1\n    return labels_one_hot\n\ndef compute_ap(class_score_matrix, labels):\n    num_classes=class_score_matrix.shape[1]\n    one_hot_labels=dense_to_one_hot(labels, num_classes)\n    predictions=np.array(class_score_matrix>0, dtype=\"int32\")\n    average_precision=[]\n    for i in range(num_classes):\n        ps=average_precision_score(one_hot_labels[:, i], class_score_matrix[:, i])\n       # if not np.isnan(ps):\n        average_precision.append(ps)\n    return np.array(average_precision)\n\ndef calculate_IoU(i0,i1):\n    union = (min(i0[0], i1[0]), max(i0[1], i1[1]))\n    inter = (max(i0[0], i1[0]), min(i0[1], i1[1]))\n    iou = 1.0*(inter[1]-inter[0])/(union[1]-union[0])\n    return iou\n\ndef nms_temporal(x1,x2,s, overlap):\n    pick = []\n    assert len(x1)==len(s)\n    assert len(x2)==len(s)\n    if len(x1)==0:\n        return pick\n\n    #x1 = [b[0] for b in boxes]\n    #x2 = [b[1] for b in boxes]\n    #s = [b[-1] for b in boxes]\n    union = map(operator.sub, x2, x1) # union = x2-x1\n    I = [i[0] for i in sorted(enumerate(s), key=lambda x:x[1])] # sort and get index\n\n    while len(I)>0:\n        i = I[-1]\n        pick.append(i)\n\n        xx1 = [max(x1[i],x1[j]) for j in I[:-1]]\n        xx2 = [min(x2[i],x2[j]) for j in I[:-1]]\n        inter = [max(0.0, k2-k1) for k1, k2 in zip(xx1, xx2)]\n        o = [inter[u]/(union[i] + union[I[u]] - inter[u]) for u in range(len(I)-1)]\n        I_new = []\n        for j in range(len(o)):\n            if o[j] <=overlap:\n                I_new.append(I[j])\n        I = I_new\n    return pick\n\n'''\ncompute recall at certain IoU\n'''\ndef compute_IoU_recall_top_n_forreg(top_n, iou_thresh, sentence_image_mat, sentence_image_reg_mat, sclips, iclips):\n    correct_num = 0.0\n    for k in range(sentence_image_mat.shape[0]):\n        gt = sclips[k]\n        gt_start = float(gt.split(\"_\")[1])\n        gt_end = float(gt.split(\"_\")[2])\n        #print gt +\" \"+str(gt_start)+\" \"+str(gt_end)\n        sim_v = [v for v in sentence_image_mat[k]]\n        starts = [s for s in sentence_image_reg_mat[k,:,0]]\n        ends = [e for e in sentence_image_reg_mat[k,:,1]]\n        picks = nms_temporal(starts,ends, sim_v, iou_thresh-0.05)\n        #sim_argsort=np.argsort(sim_v)[::-1][0:top_n]\n        if top_n<len(picks): picks=picks[0:top_n]\n        for index in picks:\n            pred_start = sentence_image_reg_mat[k, index, 0]\n            pred_end = sentence_image_reg_mat[k, index, 1]\n            iou = calculate_IoU((gt_start, gt_end),(pred_start, pred_end))\n            if iou>=iou_thresh:\n                correct_num+=1\n                break\n    return correct_num\n\n'''\nevaluate the model\n'''\ndef do_eval_slidingclips(sess, vs_eval_op, model, movie_length_info, iter_step, test_result_output):\n    IoU_thresh = [0.1, 0.2, 0.3, 0.4, 0.5]\n    all_correct_num_10 = [0.0]*5\n    all_correct_num_5 = [0.0]*5\n    all_correct_num_1 = [0.0]*5\n    all_retrievd = 0.0\n    for movie_name in 
model.test_set.movie_names:\n        movie_length=movie_length_info[movie_name.split(\".\")[0]]\n        print \"Test movie: \"+movie_name+\"....loading movie data\"\n        movie_clip_featmaps, movie_clip_sentences=model.test_set.load_movie_slidingclip(movie_name, 16)\n        print \"sentences: \"+ str(len(movie_clip_sentences))\n        print \"clips: \"+ str(len(movie_clip_featmaps))\n        sentence_image_mat=np.zeros([len(movie_clip_sentences), len(movie_clip_featmaps)])\n        sentence_image_reg_mat=np.zeros([len(movie_clip_sentences), len(movie_clip_featmaps), 2])\n        for k in range(len(movie_clip_sentences)):\n            #sentence_clip_name=movie_clip_sentences[k][0]\n            #start=float(sentence_clip_name.split(\"_\")[1])\n            #end=float(sentence_clip_name.split(\"_\")[2].split(\"_\")[0])\n            \n            sent_vec=movie_clip_sentences[k][1]\n            sent_vec=np.reshape(sent_vec,[1,sent_vec.shape[0]])\n            for t in range(len(movie_clip_featmaps)):\n                featmap = movie_clip_featmaps[t][1]\n                visual_clip_name = movie_clip_featmaps[t][0]\n                start = float(visual_clip_name.split(\"_\")[1])\n                end = float(visual_clip_name.split(\"_\")[2].split(\"_\")[0])\n                featmap = np.reshape(featmap, [1, featmap.shape[0]])\n                feed_dict = {\n                model.visual_featmap_ph_test: featmap,\n                model.sentence_ph_test:sent_vec\n                }\n                outputs = sess.run(vs_eval_op,feed_dict=feed_dict)\n                sentence_image_mat[k,t] = outputs[0]\n                reg_clip_length = (end-start)*(10**outputs[2])\n                reg_mid_point = (start+end)/2.0+movie_length*outputs[1]\n                reg_end = end+outputs[2]\n                reg_start = start+outputs[1]\n                \n                sentence_image_reg_mat[k,t,0] = reg_start\n                sentence_image_reg_mat[k,t,1] = reg_end\n        \n        iclips = [b[0] for b in movie_clip_featmaps]\n        sclips = [b[0] for b in movie_clip_sentences]\n        \n        # calculate Recall@m, IoU=n\n        for k in range(len(IoU_thresh)):\n            IoU=IoU_thresh[k]\n            correct_num_10 = compute_IoU_recall_top_n_forreg(10, IoU, sentence_image_mat, sentence_image_reg_mat, sclips, iclips)\n            correct_num_5 = compute_IoU_recall_top_n_forreg(5, IoU, sentence_image_mat, sentence_image_reg_mat, sclips, iclips)\n            correct_num_1 = compute_IoU_recall_top_n_forreg(1, IoU, sentence_image_mat, sentence_image_reg_mat, sclips, iclips)\n            print movie_name+\" IoU=\"+str(IoU)+\", R@10: \"+str(correct_num_10/len(sclips))+\"; IoU=\"+str(IoU)+\", R@5: \"+str(correct_num_5/len(sclips))+\"; IoU=\"+str(IoU)+\", R@1: \"+str(correct_num_1/len(sclips))\n            all_correct_num_10[k]+=correct_num_10\n            all_correct_num_5[k]+=correct_num_5\n            all_correct_num_1[k]+=correct_num_1\n        all_retrievd+=len(sclips)\n    for k in range(len(IoU_thresh)):\n        print \" IoU=\"+str(IoU_thresh[k])+\", R@10: \"+str(all_correct_num_10[k]/all_retrievd)+\"; IoU=\"+str(IoU_thresh[k])+\", R@5: \"+str(all_correct_num_5[k]/all_retrievd)+\"; IoU=\"+str(IoU_thresh[k])+\", R@1: \"+str(all_correct_num_1[k]/all_retrievd)\n        test_result_output.write(\"Step \"+str(iter_step)+\": IoU=\"+str(IoU_thresh[k])+\", R@10: \"+str(all_correct_num_10[k]/all_retrievd)+\"; IoU=\"+str(IoU_thresh[k])+\", R@5: \"+str(all_correct_num_5[k]/all_retrievd)+\"; 
IoU=\"+str(IoU_thresh[k])+\", R@1: \"+str(all_correct_num_1[k]/all_retrievd)+\"\\n\")\n\ndef run_training():\n    initial_steps = 0\n    max_steps = 20000\n    batch_size = 56\n    train_csv_path = \"./exp_data/TACoS/train_clip-sentvec.pkl\"\n    test_csv_path = \"./exp_data/TACoS/test_clip-sentvec.pkl\"\n    test_feature_dir=\"../TACOS/Interval128_256_overlap0.8_c3d_fc6/\"\n    train_feature_dir = \"../TACOS/Interval64_128_256_512_overlap0.8_c3d_fc6/\"\n    \n    model = ctrl_model.CTRL_Model(batch_size, train_csv_path, test_csv_path, test_feature_dir, train_feature_dir)\n    test_result_output=open(\"ctrl_test_results.txt\", \"w\")\n    with tf.Graph().as_default():\n\t\t\n        loss_align_reg, vs_train_op, vs_eval_op, offset_pred, loss_reg = model.construct_model()\n        # Create a session for running Ops on the Graph.\n        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction = 0.2)\n        sess = tf.Session(config = tf.ConfigProto(gpu_options = gpu_options))\n        # Run the Op to initialize the variables.\n        init = tf.initialize_all_variables()\n        sess.run(init)\n        for step in xrange(max_steps):\n            start_time = time.time()\n            feed_dict = model.fill_feed_dict_train_reg()\n            _, loss_value, offset_pred_v, loss_reg_v = sess.run([vs_train_op, loss_align_reg, offset_pred, loss_reg], feed_dict=feed_dict)\n            duration = time.time() - start_time\n\n            if step % 5 == 0:\n                # Print status to stdout.\n                print('Step %d: loss = %.3f (%.3f sec)' % (step, loss_value, duration))\n\n            if (step+1) % 2000 == 0:\n                print \"Start to test:-----------------\\n\"\n                movie_length_info=pickle.load(open(\"./video_allframes_info.pkl\"))\n                do_eval_slidingclips(sess, vs_eval_op, model, movie_length_info, step+1, test_result_output)\n\ndef main(_):\n    run_training()\n\n\nif __name__ == '__main__':\n    tf.app.run()\n        \t\n\n\n\n"
  },
  {
    "path": "util/__init__.py",
    "content": ""
  },
  {
    "path": "util/cnn.py",
    "content": "from __future__ import division  #, print_function\n\nimport tensorflow as tf\n\ndef conv_layer(name, bottom, kernel_size, stride, output_dim, padding='SAME',\n               bias_term=True, weights_initializer=None, biases_initializer=None):\n    # input has shape [batch, in_height, in_width, in_channels]\n    input_dim = bottom.get_shape().as_list()[-1]\n\n    # weights and biases variables\n    with tf.variable_scope(name):\n        if weights_initializer is None and biases_initializer is None:\n            # initialize the variables\n            if weights_initializer is None:\n                weights_initializer = tf.random_normal_initializer(mean=0.0, stddev=0.01)\n            if bias_term and biases_initializer is None:\n              biases_initializer = tf.constant_initializer(0.)\n            print \"input_dim\"+str(input_dim)\n            # filter has shape [filter_height, filter_width, in_channels, out_channels]\n            weights = tf.get_variable(\"weights\",\n                [kernel_size, kernel_size, input_dim, output_dim],\n                initializer=weights_initializer)\n            if bias_term:\n                biases = tf.get_variable(\"biases\", output_dim,\n                    initializer=biases_initializer)\n            print str(weights.name)+\" initialized as random or retrieved from graph\"\n            if bias_term:\n                print biases.name+\" initialized as random or retrieved from graph\"\n\n        else:\n            weights = tf.get_variable(\"weights\",\n                shape=None,\n                initializer=weights_initializer)\n            if bias_term:\n                biases = tf.get_variable(\"biases\", shape=None,\n                    initializer=biases_initializer) \n\n            print weights.name+\" initialized from pre-trained parameters or retrieved from graph\"\n            if bias_term:\n                print biases.name+\" initialized from pre-trained parameters or retrieved from graph\"\n\n\n    conv = tf.nn.conv2d(bottom, filter=weights,\n        strides=[1, stride, stride, 1], padding=padding)\n    if bias_term:\n        conv = tf.nn.bias_add(conv, biases)\n    return conv\n\ndef conv_relu_layer(name, bottom, kernel_size, stride, output_dim, padding='SAME',\n                    bias_term=True, weights_initializer=None, biases_initializer=None):\n    conv = conv_layer(name, bottom, kernel_size, stride, output_dim, padding,\n                      bias_term, weights_initializer, biases_initializer)\n    relu = tf.nn.relu(conv)\n    return relu\n\ndef deconv_layer(name, bottom, kernel_size, stride, output_dim, padding='SAME',\n                 bias_term=True, weights_initializer=None, biases_initializer=None):\n    # input_shape is [batch, in_height, in_width, in_channels]\n    input_shape = bottom.get_shape().as_list()\n    batch_size, input_height, input_width, input_dim = input_shape\n    output_shape = [batch_size, input_height*stride, input_width*stride, output_dim]\n\n    # weights and biases variables\n    with tf.variable_scope(name):\n        # initialize the variables\n        if weights_initializer is None:\n            weights_initializer = tf.random_normal_initializer()\n        if bias_term and biases_initializer is None:\n            biases_initializer = tf.constant_initializer(0.)\n\n        # filter has shape [filter_height, filter_width, out_channels, in_channels]\n        weights = tf.get_variable(\"weights\",\n            [kernel_size, kernel_size, output_dim, input_dim],\n            
initializer=weights_initializer)\n        if bias_term:\n            biases = tf.get_variable(\"biases\", output_dim,\n                initializer=biases_initializer)\n\n    deconv = tf.nn.conv2d_transpose(bottom, filter=weights,\n        output_shape=output_shape, strides=[1, stride, stride, 1],\n        padding=padding)\n    if bias_term:\n        deconv = tf.nn.bias_add(deconv, biases)\n    return deconv\n\ndef deconv_relu_layer(name, bottom, kernel_size, stride, output_dim, padding='SAME',\n                      bias_term=True, weights_initializer=None, biases_initializer=None):\n    deconv = deconv_layer(name, bottom, kernel_size, stride, output_dim, padding,\n                          bias_term, weights_initializer, biases_initializer)\n    relu = tf.nn.relu(deconv)\n    return relu\n\ndef pooling_layer(name, bottom, kernel_size, stride):\n    pool = tf.nn.max_pool(bottom, ksize=[1, kernel_size, kernel_size, 1],\n        strides=[1, stride, stride, 1], padding='SAME', name=name)\n    return pool\n\ndef fc_layer(name, bottom, output_dim, bias_term=True, weights_initializer=None,\n             biases_initializer=None):\n    # flatten bottom input\n    # input has shape [batch, in_height, in_width, in_channels]\n    shape = bottom.get_shape().as_list()\n    input_dim = 1\n    for d in shape[1:]:\n        input_dim *= d\n    flat_bottom = tf.reshape(bottom, [-1, input_dim])\n    \n    # weights and biases variables\n    with tf.variable_scope(name):\n        if weights_initializer is None and biases_initializer is None:\n            # initialize the variables\n            if weights_initializer is None:\n                weights_initializer = tf.random_normal_initializer()\n            if bias_term and biases_initializer is None:\n                biases_initializer = tf.constant_initializer(0.)\n\n            # weights has shape [input_dim, output_dim]\n            weights = tf.get_variable(\"weights\", [input_dim, output_dim],\n                initializer=weights_initializer)\n            if bias_term:\n                biases = tf.get_variable(\"biases\", output_dim,\n                    initializer=biases_initializer)\n\n            print weights.name+\" initialized as random or retrieved from graph\"\n            if bias_term:\n                print biases.name+\" initialized as random or retrieved from graph\"\n        else:\n            weights = tf.get_variable(\"weights\", shape=None,\n                initializer=weights_initializer)\n            if bias_term:\n                biases = tf.get_variable(\"biases\", shape=None,\n                    initializer=biases_initializer)\n\n            print weights.name+\" initialized from pre-trained parameters or retrieved from graph\"\n            if bias_term:\n                print biases.name+\" initialized from pre-trained parameters or retrieved from graph\"\n\n    if bias_term:\n        fc = tf.nn.xw_plus_b(flat_bottom, weights, biases)\n    else:\n        fc = tf.matmul(flat_bottom, weights)\n    return fc\n\ndef fc_relu_layer(name, bottom, output_dim, bias_term=True,\n                  weights_initializer=None, biases_initializer=None):\n    fc = fc_layer(name, bottom, output_dim, bias_term, weights_initializer,\n                  biases_initializer)\n    relu = tf.nn.relu(fc)\n    return relu\n\ndef softmax_loss_layer(name, score_bottom, label_bottom):\n    \"\"\"\n    Calculates cumulative Softmax Cross Entropy Loss along the last dimension\n    *This function does not divide the loss by batch size*\n\n    Once tensorflow has 
SparseCrossEntropy function, this one will be replaced\n    \"\"\"\n    # Check shape\n    score_shape = score_bottom.get_shape().as_list()\n    label_shape = label_bottom.get_shape().as_list()\n    assert len(score_shape) == len(label_shape) + 1\n    assert score_shape[:-1] == label_shape\n\n    # Compute the outer dimensions in label\n    inner_dim = score_shape[-1]\n    outer_dim = 1\n    for d in label_shape: outer_dim *= d\n\n    # flatten score and label\n    flat_score = tf.reshape(score_bottom, [outer_dim, inner_dim])\n    flat_label = tf.reshape(label_bottom, [outer_dim, 1])\n\n    # Reshape the flattened labels into a dense one-hot tensor of\n    # shape [outer_dim, inner_dim]\n    indices = tf.reshape(tf.range(outer_dim), [outer_dim, 1])\n    concated = tf.concat(1, [indices, tf.cast(flat_label, tf.int32)])\n    dense_labels = tf.sparse_to_dense(concated, [outer_dim, inner_dim], 1.0, 0.0)\n\n    # cumulative (summed, not averaged) softmax cross entropy loss\n    loss = tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits(flat_score, dense_labels))\n    return loss\n"
  },
  {
    "path": "video_allframes_info.pkl",
    "content": "(dp0\nS's30-d43'\np1\nI19807\nsS's30-d40'\np2\nI4911\nsS's30-d41'\np3\nI19163\nsS's22-d55'\np4\nI1530\nsS's25-d52'\np5\nI5676\nsS's25-d51'\np6\nI3409\nsS's28-d39'\np7\nI22908\nsS's35-d55'\np8\nI6758\nsS's24-d41'\np9\nI17753\nsS's24-d40'\np10\nI6322\nsS's24-d48'\np11\nI2397\nsS's21-d29'\np12\nI1881\nsS's21-d39'\np13\nI3774\nsS's22-d29'\np14\nI3253\nsS's33-d27'\np15\nI20003\nsS's22-d46'\np16\nI14844\nsS's32-d70'\np17\nI7357\nsS's13-d52'\np18\nI2767\nsS's22-d48'\np19\nI2467\nsS's13-d54'\np20\nI8199\nsS's37-d39'\np21\nI9169\nsS's25-d23'\np22\nI3808\nsS's27-d70'\np23\nI8218\nsS's36-d50'\np24\nI5219\nsS's26-d70'\np25\nI5847\nsS's22-d53'\np26\nI6013\nsS's21-d23'\np27\nI3452\nsS's21-d21'\np28\nI4450\nsS's24-d53'\np29\nI5237\nsS's21-d28'\np30\nI3693\nsS's32-d69'\np31\nI12409\nsS's35-d48'\np32\nI7628\nsS's31-d28'\np33\nI5841\nsS's31-d25'\np34\nI2650\nsS's22-d25'\np35\nI1621\nsS's29-d42'\np36\nI22198\nsS's35-d40'\np37\nI13864\nsS's23-d31'\np38\nI3216\nsS's17-d48'\np39\nI2273\nsS's36-d42'\np40\nI18133\nsS's34-d34'\np41\nI17675\nsS's35-d41'\np42\nI27306\nsS's34-d41'\np43\nI15419\nsS's37-d46'\np44\nI13677\nsS's21-d53'\np45\nI4683\nsS's30-d26'\np46\nI17130\nsS's24-d28'\np47\nI7230\nsS's21-d55'\np48\nI2133\nsS's14-d35'\np49\nI2421\nsS's30-d29'\np50\nI6115\nsS's36-d70'\np51\nI7055\nsS's23-d51'\np52\nI6830\nsS's31-d31'\np53\nI6273\nsS's23-d54'\np54\nI9474\nsS's24-d34'\np55\nI5430\nsS's17-d55'\np56\nI2335\nsS's27-d50'\np57\nI2430\nsS's17-d53'\np58\nI7031\nsS's22-d43'\np59\nI3315\nsS's25-d69'\np60\nI14351\nsS's27-d34'\np61\nI1772\nsS's23-d46'\np62\nI10819\nsS's21-d42'\np63\nI6090\nsS's21-d43'\np64\nI4033\nsS's23-d42'\np65\nI12416\nsS's21-d45'\np66\nI2866\nsS's14-d26'\np67\nI12483\nsS's14-d27'\np68\nI6134\nsS's21-d40'\np69\nI3238\nsS's26-d26'\np70\nI41240\nsS's26-d23'\np71\nI2842\nsS's17-d42'\np72\nI15601\nsS's34-d28'\np73\nI11816\nsS's23-d45'\np74\nI4184\nsS's29-d52'\np75\nI5871\nsS's15-d70'\np76\nI8315\nsS's27-d45'\np77\nI4757\nsS's29-d50'\np78\nI2847\nsS's27-d29'\np79\nI8024\nsS's21-d35'\np80\nI1875\nsS's27-d21'\np81\nI4408\nsS's37-d25'\np82\nI1436\nsS's37-d21'\np83\nI11130\nsS's23-d34'\np84\nI5231\nsS's23-d39'\np85\nI5444\nsS's37-d29'\np86\nI3012\nsS's27-d54'\np87\nI10969\nsS's14-d51'\np88\nI8323\nsS's28-d46'\np89\nI11162\nsS's23-d21'\np90\nI4866\nsS's13-d48'\np91\nI3228\nsS's32-d27'\np92\nI22541\nsS's13-d21'\np93\nI2955\nsS's13-d25'\np94\nI2951\nsS's13-d28'\np95\nI5629\nsS's14-d46'\np96\nI9249\nsS's14-d43'\np97\nI4667\nsS's22-d34'\np98\nI2401\nsS's21-d50'\np99\nI1607\nsS's24-d23'\np100\nI5735\nsS's29-d31'\np101\nI3164\nsS's17-d69'\np102\nI12454\nsS's15-d26'\np103\nI33264\nsS's29-d39'\np104\nI9630\nsS's32-d55'\np105\nI5689\nsS's32-d52'\np106\nI5508\nsS's13-d31'\np107\nI5093\nsS's36-d31'\np108\nI7071\nsS's36-d43'\np109\nI26924\nsS's33-d45'\np110\nI7698\nsS's22-d26'\np111\nI9365\nsS's28-d27'\np112\nI17235\nsS's28-d25'\np113\nI3220\nsS's30-d53'\np114\nI13379\nsS's30-d52'\np115\nI7362\nsS's33-d54'\np116\nI10898\nsS's22-d35'\np117\nI2555\nsS's15-d35'\np118\nI6472\nsS's33-d50'\np119\nI3466\nsS's13-d40'\np120\nI3347\nsS's13-d45'\np121\nI4354\nsS's36-d23'\np122\nI19412\nsS's25-d35'\np123\nI3900\nsS's28-d51'\np124\nI19725\nsS's26-d69'\np125\nI20978\nsS's36-d27'\np126\nI4390\nsS's34-d69'\np127\nI12952\ns."
  },
  {
    "path": "vs_multilayer.py",
    "content": "from __future__ import division\n\nimport numpy as np\nimport tensorflow as tf\n\n# components\nfrom tensorflow.python.ops.nn import dropout as drop\nfrom util.cnn import conv_layer as conv\nfrom util.cnn import conv_relu_layer as conv_relu\nfrom util.cnn import pooling_layer as pool\nfrom util.cnn import fc_layer as fc\nfrom util.cnn import fc_relu_layer as fc_relu\n\ndef vs_multilayer(input_batch,name,middle_layer_dim=1000,reuse=False):\n    with tf.variable_scope(name):\n        if reuse==True:\n            print name+\" reuse variables\"\n            tf.get_variable_scope().reuse_variables()\n        else:\n            print name+\" doesn't reuse variables\"\n\n        layer1 = conv_relu('layer1', input_batch,\n                        kernel_size=1,stride=1,output_dim=middle_layer_dim)\n        sim_score = conv('layer2', layer1,\n                        kernel_size=1,stride=1,output_dim=3)\n    return sim_score\n"
  }
]