Repository: xlliu7/TadTR Branch: master Commit: 983ae14bcec8 Files: 37 Total size: 197.1 KB Directory structure: gitextract_inim4nmp/ ├── .gitignore ├── Evaluation/ │ ├── README.md │ ├── eval_detection.py │ └── utils.py ├── LICENSE ├── README.md ├── configs/ │ └── thumos14_i3d2s_tadtr.yml ├── datasets/ │ ├── __init__.py │ ├── data_utils.py │ ├── path.yml │ ├── tad_dataset.py │ └── tad_eval.py ├── demo.py ├── docs/ │ └── 1_train_on_your_dataset.md ├── engine.py ├── main.py ├── models/ │ ├── __init__.py │ ├── custom_loss.py │ ├── matcher.py │ ├── ops/ │ │ ├── roi_align/ │ │ │ ├── __init__.py │ │ │ ├── roi_align.py │ │ │ └── src/ │ │ │ ├── roi_align_cuda.cpp │ │ │ └── roi_align_kernel.cu │ │ ├── setup.py │ │ └── temporal_deform_attn/ │ │ ├── __init__.py │ │ └── temporal_deform_attn.py │ ├── position_encoding.py │ ├── tadtr.py │ └── transformer.py ├── opts.py ├── requirements.txt ├── scripts/ │ ├── run_parallel.sh │ └── test_reference_models.sh └── util/ ├── __init__.py ├── logger.py ├── misc.py └── segment_ops.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # User defined data/ outputs/ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ ================================================ FILE: Evaluation/README.md ================================================ #ActivityNet Large Scale Activity Recognition Challenge - Evaluation Toolkit This is the documentation of the ActivityNet Large Scale Activity Recognition Challenge Evaluation Toolkit. It includes APIs to evaluate the performance of a method in the two different tasks in the challenge: *untrimmed video classification* and *activity detection*. For more information about the challenge competitions, please read the [guidelines](http://activity-net.org/challenges/2016/guidelines.html). ##Dependencies The Evaluation Toolkit is purely written in Python (>=2.7) and it requires the following third party libraries: * [Numpy](http://www.numpy.org/) * [Pandas](http://pandas.pydata.org/) ##Getting started We include sample prediction files in the folder data to show how to evaluate your prediction results. Please follow this steps to obtain the performance evaluation on the provided sample files: * Run `git clone` this repository. * To evaluate classification performance call: `python get_classification_performance.py data/activity_net.v1-3.min.json sample_classification_prediction.json` * To evaluate detection performance call: `python get_detection_performance.py data/activity_net.v1-3.min.json sample_detection_prediction.json` ##Contributions and Troubleshooting We are welcome to contributions, please keep your pull-request simple so we can go back to you as soon as we can. If you found a bug please open a new issue and describe the problem. ================================================ FILE: Evaluation/eval_detection.py ================================================ import json import sys import urllib.error, urllib.parse import numpy as np import pandas as pd from .utils import get_blocked_videos from .utils import interpolated_prec_rec from .utils import segment_iou import pdb import traceback import logging from joblib import Parallel, delayed logger_initilized = False def setup_logger(log_file_path, name=None, level=logging.INFO): """ Setup a logger that simultaneously output to a file and stdout ARGS log_file_path: string, path to the logging file """ # logging settings # log_formatter = logging.Formatter("%(asctime)s [%(levelname)-5.5s] %(message)s") log_formatter = logging.Formatter( "[%(asctime)s][%(levelname)s] %(pathname)s: %(lineno)4d: %(message)s", datefmt="%m/%d %H:%M:%S") root_logger = logging.getLogger(name) if name: root_logger.propagate = False root_logger.setLevel(level) # file handler if log_file_path is not None: log_file_handler = logging.FileHandler(log_file_path) log_file_handler.setFormatter(log_formatter) root_logger.addHandler(log_file_handler) log_formatter = logging.Formatter( "[%(asctime)s][%(levelname)s]: %(message)s", datefmt="%m/%d %H:%M:%S") log_stream_handler = logging.StreamHandler(sys.stdout) log_stream_handler.setFormatter(log_formatter) # log_stream_handler.setLevel(logging.INFO) root_logger.addHandler(log_stream_handler) logging.info('Log file is %s' % log_file_path) global logger_initilized logger_initilized = True return root_logger def get_classes(anno_dict): if 'classes' in anno_dict: classes = anno_dict['classes'] else: database = anno_dict['database'] all_gts = [] for vid in database: all_gts += database[vid]['annotations'] classes = list(sorted({x['label'] for x in all_gts})) return classes class ANETdetection(object): GROUND_TRUTH_FIELDS = ['database', 'taxonomy', 'version'] PREDICTION_FIELDS = ['results', 'version', 'external_data'] def __init__(self, ground_truth_filename=None, prediction_filename=None, ground_truth_fields=GROUND_TRUTH_FIELDS, prediction_fields=PREDICTION_FIELDS, tiou_thresholds=np.linspace(0.5, 0.95, 10), subset='validation', verbose=False, check_status=False, log_path=None, exclude_videos=None): if not ground_truth_filename: raise IOError('Please input a valid ground truth file.') if not prediction_filename: raise IOError('Please input a valid prediction file.') self.subset = subset # if log_path is None: if not logger_initilized: print('setup logger') logger = setup_logger(log_path) else: logger = logging.getLogger() self.logger = logger self.tiou_thresholds = tiou_thresholds self.verbose = verbose self.gt_fields = ground_truth_fields self.pred_fields = prediction_fields self.ap = None self.check_status = check_status self.blocked_videos = exclude_videos if exclude_videos else list() # self.blocked_videos = ['video_test_0000270', 'video_test_0001292', 'video_test_0001496'] # Import ground truth and predictions. self.ground_truth, self.activity_index = self._import_ground_truth( ground_truth_filename) self.prediction = self._import_prediction(prediction_filename) if self.verbose: self.logger.info('[INIT] Loaded annotations from {} subset.'.format(subset)) nr_gt = len(self.ground_truth) self.logger.info('\tNumber of ground truth instances: {}'.format(nr_gt)) nr_pred = len(self.prediction) self.logger.info('\tNumber of predictions: {}'.format(nr_pred)) self.logger.info('\tFixed threshold for tiou score: {}'.format(self.tiou_thresholds)) def _import_ground_truth(self, ground_truth_filename): """Reads ground truth file, checks if it is well formatted, and returns the ground truth instances and the activity classes. Parameters ---------- ground_truth_filename : str Full path to the ground truth json file. Outputs ------- ground_truth : df Data frame containing the ground truth instances. activity_index : dict Dictionary containing class index. """ if isinstance(ground_truth_filename, str): with open(ground_truth_filename, 'r') as fobj: data = json.load(fobj) else: data = ground_truth_filename # # Checking format # if not all([field in list(data.keys()) for field in self.gt_fields]): # raise IOError('Please input a valid ground truth file.') # Read ground truth data. # activity_index, cidx = {}, 0 class_list = get_classes(data) activity_index = {cls_name: idx for idx, cls_name in enumerate(class_list)} video_lst, t_start_lst, t_end_lst, label_lst, difficult_lst = [], [], [], [], [] for videoid, v in data['database'].items(): if self.subset != v['subset']: continue if videoid in self.blocked_videos: continue for ann in v['annotations']: # if ann['label'] not in class_list: # class_list.append(ann['label']) video_lst.append(videoid) t_start_lst.append(float(ann['segment'][0])) t_end_lst.append(float(ann['segment'][1])) label_lst.append(activity_index[ann['label']]) difficult = 0 if 'difficult' not in ann else ann['difficult'] difficult_lst.append(difficult) ground_truth = pd.DataFrame({'video-id': video_lst, 't-start': t_start_lst, 't-end': t_end_lst, 'label': label_lst, 'difficult': difficult_lst}) self.class_list = [x for x in class_list] return ground_truth, activity_index def _import_prediction(self, prediction_filename): """Reads prediction file, checks if it is well formatted, and returns the prediction instances. Parameters ---------- prediction_filename : str Full path to the prediction json file. Outputs ------- prediction : df Data frame containing the prediction instances. """ if isinstance(prediction_filename, str): with open(prediction_filename, 'r') as fobj: data = json.load(fobj) else: data = prediction_filename # Checking format... if not all([field in list(data.keys()) for field in self.pred_fields]): raise IOError('Please input a valid prediction file.') # Read predicitons. video_lst, t_start_lst, t_end_lst = [], [], [] label_lst, score_lst = [], [] for videoid, v in data['results'].items(): if videoid in self.blocked_videos: continue for result in v: label = self.activity_index[result['label']] video_lst.append(videoid) t_start_lst.append(result['segment'][0]) t_end_lst.append(result['segment'][1]) label_lst.append(label) score_lst.append(result['score']) prediction = pd.DataFrame({'video-id': video_lst, 't-start': t_start_lst, 't-end': t_end_lst, 'label': label_lst, 'score': score_lst}) return prediction # def wrapper_compute_average_precision(self): # """Computes average precision for each class in the subset. # """ # ap = np.zeros((len(self.tiou_thresholds), len(list(self.activity_index.items())))) # for activity, cidx in self.activity_index.items(): # gt_idx = self.ground_truth['label'] == cidx # pred_idx = self.prediction['label'] == cidx # ap[:,cidx] = compute_average_precision_detection( # self.ground_truth.loc[gt_idx].reset_index(drop=True), # self.prediction.loc[pred_idx].reset_index(drop=True), # tiou_thresholds=self.tiou_thresholds) # return ap ################################# copied from GTAD ####################################### def _get_predictions_with_label(self, prediction_by_label, label_name, cidx): """Get all predicitons of the given label. Return empty DataFrame if there is no predcitions with the given label. """ try: return prediction_by_label.get_group(cidx).reset_index(drop=True) except: if self.verbose: print('Warning: No predictions of label \'%s\' were provdied.' % label_name) return pd.DataFrame() def wrapper_compute_average_precision(self): """Computes average precision for each class in the subset. """ ap = np.zeros((len(self.tiou_thresholds), len(self.activity_index))) # Adaptation to query faster ground_truth_by_label = self.ground_truth.groupby('label') prediction_by_label = self.prediction.groupby('label') results = Parallel(n_jobs=len(self.activity_index))( delayed(compute_average_precision_detection)( ground_truth=ground_truth_by_label.get_group(cidx).reset_index(drop=True), prediction=self._get_predictions_with_label(prediction_by_label, label_name, cidx), tiou_thresholds=self.tiou_thresholds, ) for label_name, cidx in self.activity_index.items()) for i, cidx in enumerate(self.activity_index.values()): ap[:,cidx] = results[i] return ap ################################################################################# def evaluate(self): """Evaluates a prediction file. For the detection task we measure the interpolated mean average precision to measure the performance of a method. """ self.ap = self.wrapper_compute_average_precision() self.mAP = self.ap.mean(axis=1) if self.verbose: self.logger.info('[RESULTS] Performance on ActivityNet detection task.') self.logger.info('\n{}'.format(' '.join(['%.4f' % (x * 1) for x in self.mAP]))) self.logger.info('\tAverage-mAP: {}'.format(self.mAP.mean())) def compute_average_precision_detection(ground_truth, prediction, tiou_thresholds=np.linspace(0.5, 0.95, 10)): """Compute average precision (detection task) between ground truth and predictions data frames. If multiple predictions occurs for the same predicted segment, only the one with highest score is matches as true positive. This code is greatly inspired by Pascal VOC devkit. Parameters ---------- ground_truth : df Data frame containing the ground truth instances. Required fields: ['video-id', 't-start', 't-end'] prediction : df Data frame containing the prediction instances. Required fields: ['video-id, 't-start', 't-end', 'score'] tiou_thresholds : 1darray, optional Temporal intersection over union threshold. Outputs ------- ap : float Average precision score. """ npos = float(len(ground_truth)) lock_gt = np.ones((len(tiou_thresholds),len(ground_truth))) * -1 # Sort predictions by decreasing score order. sort_idx = prediction['score'].values.argsort()[::-1] prediction = prediction.loc[sort_idx].reset_index(drop=True) # Initialize true positive and false positive vectors. tp = np.zeros((len(tiou_thresholds), len(prediction))) fp = np.zeros((len(tiou_thresholds), len(prediction))) # Adaptation to query faster ground_truth_gbvn = ground_truth.groupby('video-id') # Assigning true positive to truly grount truth instances. for idx, this_pred in prediction.iterrows(): try: # Check if there is at least one ground truth in the video associated. ground_truth_videoid = ground_truth_gbvn.get_group(this_pred['video-id']) except Exception as e: # print(e) fp[:, idx] = 1 continue this_gt = ground_truth_videoid.reset_index() tiou_arr = segment_iou(this_pred[['t-start', 't-end']].values, this_gt[['t-start', 't-end']].values) # We would like to retrieve the predictions with highest tiou score. tiou_sorted_idx = tiou_arr.argsort()[::-1] # matched_to_difficult = False for tidx, tiou_thr in enumerate(tiou_thresholds): for jdx in tiou_sorted_idx: if tiou_arr[jdx] < tiou_thr: fp[tidx, idx] = 1 break if lock_gt[tidx, this_gt.loc[jdx]['index']] >= 0: continue # Assign as true positive after the filters above. tp[tidx, idx] = 1 lock_gt[tidx, this_gt.loc[jdx]['index']] = idx break if fp[tidx, idx] == 0 and tp[tidx, idx] == 0: fp[tidx, idx] = 1 ap = np.zeros(len(tiou_thresholds)) for tidx in range(len(tiou_thresholds)): # Computing prec-rec this_tp = np.cumsum(tp[tidx,:]).astype(np.float) this_fp = np.cumsum(fp[tidx,:]).astype(np.float) rec = this_tp / npos prec = this_tp / (this_tp + this_fp) ap[tidx] = interpolated_prec_rec(prec, rec) return ap ================================================ FILE: Evaluation/utils.py ================================================ import json import urllib.request, urllib.error, urllib.parse import numpy as np API = 'http://ec2-52-11-11-89.us-west-2.compute.amazonaws.com/challenge17/api.py' def get_blocked_videos(api=API): api_url = '{}?action=get_blocked'.format(api) req = urllib.request.Request(api_url) response = urllib.request.urlopen(req) return json.loads(response.read()) def interpolated_prec_rec(prec, rec): """Interpolated AP - VOCdevkit from VOC 2011. """ mprec = np.hstack([[0], prec, [0]]) mrec = np.hstack([[0], rec, [1]]) for i in range(len(mprec) - 1)[::-1]: mprec[i] = max(mprec[i], mprec[i + 1]) idx = np.where(mrec[1::] != mrec[0:-1])[0] + 1 ap = np.sum((mrec[idx] - mrec[idx - 1]) * mprec[idx]) return ap def segment_iou(target_segment, candidate_segments): """Compute the temporal intersection over union between a target segment and all the test segments. Parameters ---------- target_segment : 1d array Temporal target segment containing [starting, ending] times. candidate_segments : 2d array Temporal candidate segments containing N x [starting, ending] times. Outputs ------- tiou : 1d array Temporal intersection over union score of the N's candidate segments. """ tt1 = np.maximum(target_segment[0], candidate_segments[:, 0]) tt2 = np.minimum(target_segment[1], candidate_segments[:, 1]) # Intersection including Non-negative overlap score. segments_intersection = (tt2 - tt1).clip(0) # Segment union. segments_union = (candidate_segments[:, 1] - candidate_segments[:, 0]) \ + (target_segment[1] - target_segment[0]) - segments_intersection # Compute overlap as the ratio of the intersection # over union of two segments. tIoU = segments_intersection.astype(float) / segments_union return tIoU def wrapper_segment_iou(target_segments, candidate_segments): """Compute intersection over union btw segments Parameters ---------- target_segments : ndarray 2-dim array in format [m x 2:=[init, end]] candidate_segments : ndarray 2-dim array in format [n x 2:=[init, end]] Outputs ------- tiou : ndarray 2-dim array [n x m] with IOU ratio. Note: It assumes that candidate-segments are more scarce that target-segments """ if candidate_segments.ndim != 2 or target_segments.ndim != 2: raise ValueError('Dimension of arguments is incorrect') n, m = candidate_segments.shape[0], target_segments.shape[0] tiou = np.empty((n, m)) for i in range(m): tiou[:, i] = segment_iou(target_segments[i,:], candidate_segments) return tiou ================================================ FILE: LICENSE ================================================ Copyright (c) 2021 - 2022, Xiaolong Liu et al. All Rights Reserved. Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. ----------------------------------------------------------------------- Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) Copyright 2020, SenseTime Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ----------------------------------------------------------------------- DETR (https://github.com/facebookresearch/detr) Copyright 2020 - present, Facebook, Inc Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================ # TadTR: End-to-end Temporal Action Detection with Transformer [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/end-to-end-temporal-action-detection-with/temporal-action-localization-on-thumos14)](https://paperswithcode.com/sota/temporal-action-localization-on-thumos14?p=end-to-end-temporal-action-detection-with) By [Xiaolong Liu](https://github.com/xlliu7), [Qimeng Wang](https://scholar.google.com/citations?user=hi7AeE8AAAAJ), [Yao Hu](https://scholar.google.com/citations?user=LIu7k7wAAAAJ), [Xu Tang](https://scholar.google.com/citations?user=grP24aAAAAAJ), [Shiwei Zhang](https://scholar.google.com/citations?user=ZO3OQ-8AAAAJ), [Song Bai](http://songbai.site), [Xiang Bai](https://scholar.google.com/citations?user=UeltiQ4AAAAJ). This repo holds the code for TadTR, described in the paper [End-to-end temporal action detection with Transformer](https://arxiv.org/abs/2106.10271) published in IEEE Transactions on Image Processing (TIP) 2022. We have also explored fully end-to-end training from RGB images with TadTR. See our CVPR 2022 work [E2E-TAD][e2e-tad]. ## Introduction TadTR is an end-to-end Temporal Action Detection TRansformer. It has the following advantages over previous methods: - Simple. It adopts a set-prediction pipeline and achieves TAD with a *single network*. It does not require a separate proposal generation stage. - Flexible. It removes hand-crafted design such as anchor setting and NMS. - Sparse. It produces very sparse detections (e.g. 10 on ActivityNet), thus requiring lower computation cost. - Strong. As a *self-contained* temporal action detector, TadTR achieves state-of-the-art performance on HACS and THUMOS14. It is also much stronger than concurrent Transformer-based methods such as **RTD-Net** and **AGT**. ![](data_intro/arch.png "Architecture") ## Updates [2023.2.19] Fix a bug a loss caculation ([issue #21](https://github.com/xlliu7/TadTR/issues/21)). Thank [@zachpvin](https://github.com/zachpvin) for raising this issue! [2022.8.7] Add support for training/testing on THUMOS14! [2022.7.4] Glad to share that this paper will appear in IEEE Transactions on Image Processing (TIP). Although I am still busy with my thesis, I will try to make the code accessible soon. Thanks for your patience. [2022.6] Update the technical report of this work on arxiv (now v3). [2022.3] Our new work [E2E-TAD][e2e-tad] based on TadTR is accepted to CVPR 2022. It supports fully end-to-end training from RGB images. [2021.9.15] Update the performance on THUMOS14. [2021.9.1] Add demo code. [2021.7] Our revised paper was submitted to IEEE Transactions on Image Processing. [2021.6] Our revised paper was uploaded to arxiv. [2021.1.21] Our paper was submitted to IJCAI 2021. ## TODOs - [x] add model code - [x] add inference code - [x] add training code - [x] support training/inference with video input. See [E2E-TAD][e2e-tad] ## Main Results - HACS Segments |Method|Feature|mAP@0.5|mAP@0.75|mAP@0.95|Avg. mAP| | :----: |:----: | :--: | :----: | :---: | :----: | |TadTR|I3D RGB|47.14 |32.11 |10.94| 32.09| - THUMOS14 |Method|Feature|mAP@0.3|mAP@0.4|mAP@0.5|mAP@0.6|mAP@0.7|Avg. mAP| | :----: |:----: | :--: | :----: | :---: | :----: |:----: | :----: | |TadTR|I3D 2stream|74.8 |69.1| 60.1| 46.6| 32.8| 56.7| - ActivityNet-1.3 |Method|Feature|mAP@0.5|mAP@0.75|mAP@0.95|Avg. mAP| | :----: |:----: | :--: | :----: | :---: | :----: | |TadTR|TSN 2stream|51.29 |34.99| 9.49| 34.64| |TadTR|TSP|53.62| 37.52| 10.56| 36.75| ## Install ### Requirements * Linux or Windows * Python>=3.7 * (Optional) CUDA>=9.2, GCC>=5.4 * PyTorch>=1.5.1, torchvision>=0.6.1 (following instructions [here](https://pytorch.org/)) * Other requirements ```bash pip install -r requirements.txt ``` ### Compiling CUDA extensions (Optional) The RoIAlign operator is implemented with CUDA extension. If your machine does have a NVIDIA GPU with CUDA support, you can run this step. Otherwise, please set `disable_cuda=True` in `opts.py`. ```bash cd model/ops; # If you have multiple installations of CUDA Toolkits, you'd better add a prefix # CUDA_HOME= to specify the correct version. python setup.py build_ext --inplace ``` ### Run a quick test ``` python demo.py ``` ## 1.Data Preparation Currently we only support `thumos14`. ### THUMOS14 Download all data from [[BaiduDrive(code: adTR)]](https://pan.baidu.com/s/183VprlbKNjMb3Gr-rfmROQ) or [[OneDrive]](https://husteducn-my.sharepoint.com/:f:/g/personal/liuxl_hust_edu_cn/EsMyXDlkrTdBsikoRQSIeUsBkxJJRsplbMyIQVYotiZRIQ?e=QYgiCH). - Features: Download the I3D features `I3D_2stream_Pth.tar`. It was originally provided by the authors of P-GCN. I have concatenated the RGB and Flow features (drop the tail of the longer one if the lengths are inconsistent) and converted the data to float32 precision to save space. - Annotations: The annotations of action instances and the meta information of feature files. Both are in JSON format (`th14_annotations_with_fps_duration.json` and `th14_i3d2s_ft_info.json`). - Pre-trained Reference Models: Our pretrained model that use I3D features `thumos14_i3d2s_tadtr_reference.pth`. This model corresponds to the config file `configs/thumos14_i3d2s_tadtr.yml`. After downloading is finished, extract the archived feature files inplace by `cd data;tar -xf I3D_2stream_Pth.tar`. Then put the features, annotations, the model under the `data/thumos14` directory. We expect the following structure in root folder. ``` - data - thumos14 - I3D_2stream_Pth - xxxxx - xxxxx - th14_annotations_with_fps_duration.json - th14_i3d2s_ft_info.json - thumos14_tadtr_reference.pth ``` ## 2.Testing Pre-trained Models Run ``` python main.py --cfg CFG_PATH --eval --resume CKPT_PATH ``` CFG_PATH is the path to the YAML-format config file that defines the experimental setting. For example, `configs/thumos14_i3d2s_tadtr.yml`. CKPT_PATH is the path of the pre-trained model. Alternatively, you can execute the Shell script `bash scripts/test_reference_models.sh thumos14` for simplity. ## 3.Training by Yourself Run the following command ``` python main.py --cfg CFG_PATH ``` This codebase supports running on both CPU and GPU. - To run on CPU: please add ` --device cpu` to the above command. Also, you need to set `disable_cuda=True` in `opts.py`. The CPU mode does not support actionness regression and the detection performance is lower. - To run on GPU: since the model is very lightweight, just one GPU is enough. You may specify the GPU device ID (e.g., 0) to use by the adding the prefix `CUDA_VISIBLE_DEVICES=ID ` before the above command. To run on multiple GPUs, please refer to `scripts/run_parallel.sh`. During training, our code will automatically perform testing every N epochs (N is the `test_interval` in opts.py). Training takes 6~10 minutes on THUMOS14 if you use a modern GPU (e.g. TITAN Xp). You can also monitor the training process with Tensorboard (need to set `cfg.tensorboard=True` in `opts.py`). The tensorboard record and the checkpoint will be saved at `output_dir` (can be modified in config file). After training is done, you can also test your trained model by running ``` python main.py --cfg CFG_PATH --eval ``` It will automatically use the best model checkpoint. If you want to manually specify the model checkpoint, run ``` python main.py --cfg CFG_PATH --eval --resume CKPT_PATH ``` Note that the performance of the model trained by your own may be different from the reference model, even though all seeds are fixed. The reason is that TadTR uses the `grid_sample` operator, whoses gradient computation involves the non-deterministic `AtomicAdd` operator. Please refer to [ref1](https://pytorch.org/docs/stable/notes/randomness.html) [ref2](https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html#torch.use_deterministic_algorithms) [ref3(Chinese)](https://zhuanlan.zhihu.com/p/109166845) for details. ## Acknowledgement The code is based on the [DETR](https://github.com/facebookresearch/detr) and [Deformable DETR](https://github.com/fundamentalvision/Deformable-DETR). We also borrow the implementation of the RoIAlign1D from [G-TAD](https://github.com/Frostinassiky/gtad). Thanks for their great works. ## Citing ``` @article{liu2022end, title={End-to-end Temporal Action Detection with Transformer}, author={Liu, Xiaolong and Wang, Qimeng and Hu, Yao and Tang, Xu and Zhang, Shiwei and Bai, Song and Bai, Xiang}, journal={IEEE Transactions on Image Processing (TIP)}, year={2022} } ``` ## Contact For questions and suggestions, please contact Xiaolong Liu by email ("liuxl at hust dot edu dot cn"). [e2e-tad]: https://github.com/xlliu7/E2E-TAD ================================================ FILE: configs/thumos14_i3d2s_tadtr.yml ================================================ # model setting enc_layers: 4 dec_layers: 4 dim_feedforward: 1024 num_queries: 40 # data setting dataset_name: thumos14 feature: i3d2s feature_dim: 2048 online_slice: true slice_len: 128 slice_overlap: 0.75 test_slice_overlap: 0.25 # output output_dir: outputs/thumos14_i3d2s_tadtr ================================================ FILE: datasets/__init__.py ================================================ from .tad_dataset import build as build_video_dataset def build_dataset(subset, args, mode): if args.dataset_name in ['activitynet', 'thumos14', 'hacs', 'muses']: return build_video_dataset(args.dataset_name, subset, args, mode) raise ValueError(f'dataset {args.dataset_name} not supported') ================================================ FILE: datasets/data_utils.py ================================================ '''Utilities for data loading''' import json import math import logging import os import pandas as pd import easydict import yaml import numpy as np # import cv2 import torch import torch.nn.functional as F # import ipdb as pdb def load_json(path): return json.load(open(path)) def get_valid_anno(gt_instances, slice, thr=0.75, start_getter=lambda x: x['segment'][0], end_getter=lambda x: x['segment'][1]): '''Perform integrity based instance filtering''' start, end = slice kept_instances = [] for inst in gt_instances: # ignore insts outside the time window (slice) if end_getter(inst) <= start or start_getter(inst) >= end: continue else: # clamped inst new_start = max(start_getter(inst), start) new_end = min(end_getter(inst), end) integrity = (new_end - new_start) * 1.0 / (end_getter(inst) - start_getter(inst)) if integrity >= thr: new_inst = {k:v for k,v in inst.items()} new_inst['segment'] = [new_start - start, new_end - start] kept_instances.append(new_inst) return kept_instances def get_dataset_dict(video_info_path, video_anno_path, subset, mode='test', exclude_videos=None, online_slice=False, slice_len=None, ignore_empty=True, slice_overlap=0, return_id_list=False): ''' Prepare a dict that contains the information of each video, such as duration, annotations. Args: video_info_path: path to the video info file in json format. This file records the length and fps of each video. video_anno_path: path to the ActivityNet-style video annotation in json format. subset: e.g. train, val, test mode: train (for training) or test (for inference). online_slice: cut videos into slices for training and testing. It should be enabled if the videos are too long. slice_len: length of video slices. ignore_empty: ignore video slices that does not contain any action instance. This should be enabled only in the training phase. slice_overlap: overlap ration between adjacent slices (= overlap_length / slice_len) Return: dict ''' video_ft_info = load_json(video_info_path) anno_data = load_json(video_anno_path)['database'] video_dict = {} id_list = [] cnt = 0 video_set = set([x for x in anno_data if anno_data[x]['subset'] in subset]) video_set = video_set.intersection(video_ft_info.keys()) if exclude_videos is not None: assert isinstance(exclude_videos, (list, tuple)) video_set = video_set.difference(exclude_videos) video_list = list(sorted(video_set)) for video_name in video_list: # remove ambiguous instances on THUMOS14 annotations = [x for x in anno_data[video_name]['annotations'] if x['label'] != 'Ambiguous'] annotations = list(sorted(annotations, key=lambda x: sum(x['segment']))) if video_name in video_ft_info: # video_info records the length in snippets, duration and fps (#frames per second) of the feature/image sequence video_info = video_ft_info[video_name] # number of frames or snippets feature_length = int(video_info['feature_length']) feature_fps = video_info['feature_fps'] feature_second = video_info['feature_second'] else: continue video_subset = anno_data[video_name]['subset'] # For THUMOS14, we crop video into slices of fixed length if online_slice: stride = slice_len * (1 - slice_overlap) if feature_length <= slice_len: slices = [[0, feature_length]] else: # stride * (i - 1) + slice_len <= feature_length # i <= (feature_length - slice_len) num_complete_slices = int(math.floor( (feature_length / slice_len - 1) / (1 - slice_overlap) + 1)) slices = [ [int(i * stride), int(i * stride) + slice_len] for i in range(num_complete_slices)] if (num_complete_slices - 1) * stride + slice_len < feature_length: # if video_name == 'video_test_0000006': # pdb.set_trace() if mode != 'train': # take the last incomplete slice last_slice_start = int(stride * num_complete_slices) else: # move left to get a complete slice. # This is a historical issue. The performance might be better # if we keep the same rule for training and inference last_slice_start = max(0, feature_length - slice_len) slices.append([last_slice_start, feature_length]) num_kept_slice = 0 for slice in slices: time_slices = [slice[0] / video_info['feature_fps'], slice[1] / video_info['feature_fps']] feature_second = time_slices[1] - time_slices[0] # perform integrity-based instance filtering valid_annotations = get_valid_anno(annotations, time_slices) if not ignore_empty or len(valid_annotations) >= 1: # rename the video slice new_vid_name = video_name + '_window_{}_{}'.format(*slice) new_vid_info = { 'annotations': valid_annotations, 'src_vid_name': video_name, 'feature_fps': feature_fps, 'feature_length': slice_len, 'subset': subset, 'feature_second': feature_second, 'time_offset': time_slices[0]} video_dict[new_vid_name] = new_vid_info id_list.append(new_vid_name) num_kept_slice += 1 if num_kept_slice > 0: cnt += 1 # for ActivityNet and hacs, use the full-length videos as samples else: if not ignore_empty or len(annotations) >= 1: # Remove incorrect annotions on ActivityNet valid_annotations = [x for x in annotations if x['segment'][1] - x['segment'][0] > 0.02] if ignore_empty and len(valid_annotations) == 0: continue video_dict[video_name] = { 'src_vid_name': video_name, 'annotations': valid_annotations, 'feature_fps': feature_fps, 'feature_length': int(feature_length), 'subset': video_subset, 'feature_second': feature_second, 'time_offset': 0} id_list.append(video_name) cnt += 1 logging.info('{} videos, {} slices'.format(cnt, len(video_dict))) if return_id_list: return video_dict, id_list else: return video_dict def load_video_frames(frame_dir, start, seq_len, stride=1, fn_tmpl='img_%07d.jpg'): raise NotImplementedError def load_feature(ft_path, ft_format, shape=None): if ft_format == 'npy': video_df = np.load(ft_path) if shape == "CT": video_df = video_df.T elif ft_format == 'torch': video_df = torch.load(ft_path).numpy() else: raise ValueError('unsupported feature format: {}'.format(ft_format)) return video_df def get_dataset_info(dataset, feature): '''get basic information for each dataset''' path_info = easydict.EasyDict(yaml.load(open('datasets/path.yml'), yaml.SafeLoader)) if dataset == 'thumos14': subset_mapping = {'train': 'val', 'val': 'test'} ann_file = path_info['thumos14']['ann_file'] if feature == 'i3d2s': feature_info = {'local_path': path_info['thumos14'][feature]['local_path'], 'format': 'torch', 'fn_templ': '%s'} ft_info_file = path_info['thumos14'][feature]['ft_info_file'] else: raise ValueError('unsupported feature, should be one of [i3d2s]') elif dataset == 'activitynet': raise NotImplementedError elif dataset == 'hacs': raise NotImplementedError elif dataset == 'muses': raise NotImplementedError else: raise ValueError('unsupported dataset {}'.format(dataset)) return subset_mapping, feature_info, ann_file, ft_info_file def make_img_transform(*args, **kwargs): raise NotImplementedError ================================================ FILE: datasets/path.yml ================================================ # set the path of features, anno file and feature info file thumos14: ann_file: 'data/thumos14/th14_annotations_with_fps_duration.json' i3d2s: local_path: data/thumos14/I3D_2stream_Pth ft_info_file: 'data/thumos14/th14_i3d2s_ft_info.json' ================================================ FILE: datasets/tad_dataset.py ================================================ # ------------------------------------------------------------------------ # TadTR: End-to-end Temporal Action Detection with Transformer # Copyright (c) 2021 - 2022. Xiaolong Liu. # ------------------------------------------------------------------------ '''Universal TAD Dataset loader.''' import json import logging import math import os.path as osp import numpy as np import pandas as pd import torch import torch.nn.functional as F import torch.utils.data import tqdm import h5py from .data_utils import get_dataset_dict, load_feature, load_video_frames, get_dataset_info, make_img_transform # from util.config import cfg from util.segment_ops import segment_t1t2_to_cw class TADDataset(torch.utils.data.Dataset): def __init__(self, subset, mode, feature_info, ann_file, ft_info_file, transforms, mem_cache=False, online_slice=False, slice_len=None, slice_overlap=0, binary=False, padding=True, input_type='feature', img_stride=1): '''TADDataset Parameters: subset: train/val/test mode: train, or test feature_info: basic info of video features, e.g. path, file format, filename template ann_file: path to the ground truth file ft_info_file: path to the file that describe other information of each video transforms: which transform to use mem_cache: cache features of the whole dataset into memory. binary: transform all gt to binary classes. This is required for training a class-agnostic detector padding: whether to pad the input feature to `slice_len` ''' super().__init__() self.feature_info = feature_info self.ann_file = ann_file self.ft_info_file = ft_info_file self.subset = subset self.online_slice = online_slice self.slice_len = slice_len self.slice_overlap = slice_overlap self.padding = padding self.mode = mode self.transforms = transforms print('Use data transform {}'.format(self.transforms)) self.binary = binary self.is_image_input = input_type == 'image' self.mem_cache = mem_cache self.img_stride = img_stride self._prepare() def _get_classes(self, anno_dict): '''get class list from the annotation dict''' if 'classes' in anno_dict: classes = anno_dict['classes'] else: database = anno_dict['database'] all_gts = [] for vid in database: all_gts += database[vid]['annotations'] classes = list(sorted({x['label'] for x in all_gts})) return classes def _prepare(self): '''parse annotation file''' anno_dict = json.load(open(self.ann_file)) self.classes = self._get_classes(anno_dict) self.video_dict, self.video_list = get_dataset_dict(self.ft_info_file, self.ann_file, self.subset, mode=self.mode, online_slice=self.online_slice, slice_len=self.slice_len, slice_overlap=self.slice_overlap, ignore_empty=self.mode == 'train', return_id_list=True) # video_list = self.video_dict.keys() # self.video_list = list(sorted(video_list)) logging.info("{} subset video numbers: {}".format(self.subset,len(self.video_list))) self.anno_dict = anno_dict self.cached_data = {} # if the features of all videos is saved in one hdf5 file (all in one), e.g. TSP features self.all_video_data = {} feature_info = self.feature_info fn_templ = feature_info['fn_templ'] src_video_list = {self.video_dict[k]['src_vid_name'] for k in self.video_list} # if feature_info.get('all_in_one', False): data = h5py.File(feature_info['local_path'][self.subset]) for k in src_video_list: self.all_video_data[k] = np.array(data[fn_templ % k]).T if not self.online_slice: self.cached_data = self.all_video_data def __len__(self): return len(self.video_list) def _get_video_data(self, index): if self.is_image_input: return self._get_img_data(index) else: return self._get_feature_data(index) def _get_feature_data(self,index): video_name = self.video_list[index] # directly fetch from memory if video_name in self.cached_data: video_data = self.cached_data[video_name] return torch.Tensor(video_data).float().contiguous() src_vid_name = self.video_dict[video_name]['src_vid_name'] # retrieve feature info feature_info = self.feature_info # "ft" is short for "feature" local_ft_dir = feature_info['local_path'] ft_format = feature_info['format'] local_ft_path = osp.join(local_ft_dir, feature_info['fn_templ'] % src_vid_name) if local_ft_dir else None # the shape of feature sequence, can be TxC (in most cases) or CxT shape = feature_info.get('shape', 'TC') if src_vid_name in self.all_video_data: feature_data = self.all_video_data[src_vid_name].T else: feature_data = load_feature(local_ft_path, ft_format, shape) feature_data = feature_data.T # T x C to C x T. if self.online_slice: slice_start, slice_end = [int(x) for x in video_name.split('_')[-2:]] assert slice_end > slice_start assert slice_start < feature_data.shape[1] feature_data = feature_data[:, slice_start:slice_end] if self.padding and feature_data.shape[1] < self.slice_len: diff = self.slice_len - feature_data.shape[1] feature_data = np.pad( feature_data, ((0, 0), (0, diff)), mode='constant') # IMPORATANT: if padded is done, the length info must be modified self.video_dict[video_name]['feature_length'] = self.slice_len self.video_dict[video_name]['feature_second'] = self.slice_len / self.video_dict[video_name]['feature_fps'] if self.mem_cache and video_name not in self.cached_data: self.cached_data[video_name] = feature_data feature_data = torch.Tensor(feature_data).float().contiguous() return feature_data def _get_img_data(self, index): '''have not been tested''' raise NotImplementedError def _get_train_label(self, video_name): '''get normalized target''' video_info = self.video_dict[video_name] video_labels = video_info['annotations'] feature_second = video_info['feature_second'] target = { 'segments': [], 'labels': [], 'orig_labels': [], 'video_id': video_name, 'video_duration': feature_second, # only used in inference 'feature_fps': video_info['feature_fps'], } for j in range(len(video_labels)): tmp_info=video_labels[j] segment = tmp_info['segment'] # special rule for thumos14, treat ambiguous instances as negatives if tmp_info['label'] not in self.classes: continue # the label id of first forground class is 0 label_id = self.classes.index(tmp_info['label']) target['orig_labels'].append(label_id) if self.binary: label_id = 0 target['segments'].append(segment) target['labels'].append(label_id) # normalized the coordinate target['segments'] = np.array(target['segments']) / feature_second if len(target['segments']) > 0: target['segments'] = segment_t1t2_to_cw(target['segments']) # convert to torch format for k, dtype in zip(['segments', 'labels'], ['float32', 'int64']): if not isinstance(target[k], torch.Tensor): target[k] = torch.from_numpy(np.array(target[k], dtype=dtype)) return target def __getitem__(self, index): # index = index % len(self.video_list) video_data = self._get_video_data(index) video_name = self.video_list[index] target = self._get_train_label(video_name) return video_data, target def build(dataset, subset, args, mode): '''build TADDataset''' subset_mapping, feature_info, ann_file, ft_info_file = get_dataset_info(dataset, args.feature) transforms = None if args.input_type == 'image': transforms = make_img_transform(mode) else: transforms = None return TADDataset( subset_mapping[subset], mode, feature_info, ann_file, ft_info_file, transforms, online_slice=args.online_slice, slice_len=args.slice_len, slice_overlap=args.slice_overlap if mode=='train' else args.test_slice_overlap, binary=args.binary, input_type=args.input_type) ================================================ FILE: datasets/tad_eval.py ================================================ # TadTR: End-to-end Temporal Action Detection with Transformer import json import os.path as osp import os import pandas as pd import time import numpy as np import logging import concurrent.futures import sys import logging # import ipdb as pdb import pickle from opts import cfg from Evaluation.eval_detection import compute_average_precision_detection # from Evaluation.eval_proposal import average_recall_vs_avg_nr_proposals import matplotlib.pyplot as plt # from util.proposal_utils import soft_nms from .data_utils import get_dataset_dict from util.misc import all_gather from util.segment_ops import soft_nms, temporal_nms def eval_ap(iou, cls, gt, predition): ap = compute_average_precision_detection(gt, predition, iou) sys.stdout.flush() return cls, ap def apply_nms(dets_arr, nms_thr=0.4, use_soft_nms=False): # the last column are class ids unique_classes = np.unique(dets_arr[:, 3]) output_dets = [] for cls in unique_classes: this_cls_dets = dets_arr[dets_arr[:,3] == cls] if not use_soft_nms: this_cls_dets_kept = temporal_nms(this_cls_dets, nms_thr) else: classes = this_cls_dets[:, [3]] this_cls_dets_kept = soft_nms(this_cls_dets, 0.8, 0, 0, 100) this_cls_dets_kept = np.concatenate((this_cls_dets_kept, classes), -1) output_dets.append(this_cls_dets_kept) output_dets = np.concatenate(output_dets, axis=0) sort_idx = output_dets[:, 2].argsort()[::-1] output_dets = output_dets[sort_idx, :] return output_dets class TADEvaluator(object): def __init__(self, dataset_name, subset, video_dict=None, nms_mode=['raw'], iou_range=[0.5], epoch=None, num_workers=None): '''dataset_name: thumos14, activitynet or hacs subset: val or test video_dict: the dataset dict created in video_dataset.py iou_range: [0.3:0.7:0.1] for thumos14; [0.5:0.95:0.05] for anet and hacs. ''' self.epoch = epoch self.iou_range = iou_range self.nms_mode = nms_mode self.dataset_name = dataset_name self.ignored_videos = list() if dataset_name == 'thumos14': subset_mapping = {'train': 'val', 'val': 'test'} anno_file = 'data/thumos14/th14_annotations_with_fps_duration.json' # follow SSN/PGCN/AFSD/MUSES to remove three falsely annotated videos self.ignored_videos = ['video_test_0000270', 'video_test_0001292', 'video_test_0001496'] else: raise NotImplementedError anno_dict = json.load(open(anno_file)) classes = self._get_classes(anno_dict) num_classes = len(classes) database = anno_dict['database'] all_gt = [] unique_video_list = [x for x in database if database[x]['subset'] in subset_mapping[subset]] for vid in unique_video_list: if vid in self.ignored_videos: continue this_gts = [x for x in database[vid]['annotations'] if x['label'] != 'Ambiguous'] all_gt += [[vid, classes.index(x['label']), x['segment'][0], x['segment'][1]] for x in this_gts] all_gt = pd.DataFrame(all_gt, columns=["video-id", "cls","t-start", "t-end"]) self.video_ids = all_gt['video-id'].unique().tolist() logging.info('{} ground truth instances from {} videos'.format(len(all_gt), len(self.video_ids))) # per class ground truth gt_by_cls = [] for cls in range(num_classes): gt_by_cls.append(all_gt[all_gt.cls == cls].reset_index(drop=True).drop('cls', 1)) self.gt_by_cls = gt_by_cls self.all_pred = {k: [] for k in self.nms_mode} self.num_classes = num_classes self.classes = classes self.anno_dict = anno_dict self.all_gt = all_gt self.num_workers = num_classes if num_workers is None else num_workers self.video_dict = video_dict self.stats = {k: dict() for k in self.nms_mode} self.subset = subset def _get_classes(self, anno_dict): if 'classes' in anno_dict: classes = anno_dict['classes'] else: database = anno_dict['database'] all_gts = [] for vid in database: all_gts += database[vid]['annotations'] classes = list(sorted({x['label'] for x in all_gts})) return classes def update(self, pred, assign_cls_labels=False): '''pred: a dict of predictions for each video. For each video, the predictions are in a dict with these fields: scores, labels, segments assign_cls_labels: manually assign class labels to the detections. This is necessary when the predictions are class-agnostic. ''' pred_numpy = {k: {kk: vv.detach().cpu().numpy() for kk, vv in v.items()} for k,v in pred.items()} for k, v in pred_numpy.items(): # pdb.set_trace() if 'window' not in k: this_dets = [ [v['segments'][i, 0], v['segments'][i, 1], v['scores'][i], v['labels'][i]] for i in range(len(v['scores']))] video_id = k else: window_start = self.video_dict[k]['time_offset'] video_id = self.video_dict[k]['src_vid_name'] this_dets = [ [v['segments'][i, 0] + window_start, v['segments'][i, 1] + window_start, v['scores'][i], v['labels'][i]] for i in range(len(v['scores']))] # ignore videos that are not in ground truth set if video_id not in self.video_ids: continue this_dets = np.array(this_dets) # start, end, score, label for nms_mode in self.nms_mode: input_dets = np.copy(this_dets) # if nms_mode == 'nms' and not (cfg.TEST_SLICE_OVERLAP > 0 and self.dataset_name == 'thumos14'): # when cfg.TEST_SLICE_OVERLAP > 0, only do nms at summarization # dets = apply_nms(input_dets, nms_thr=cfg.nms_thr, use_soft_nms=self.dataset_name=='activitynet' and assign_cls_labels) # else: if True: sort_idx = input_dets[:, 2].argsort()[::-1] dets = input_dets[sort_idx, :] # only keep top 200 detections per video dets = dets[:200, :] # On ActivityNet, follow the tradition to use external video label if assign_cls_labels: raise NotImplementedError self.all_pred[nms_mode] += [[video_id, k] + det for det in dets.tolist()] def nms_whole_dataset(self): video_ids = list(set([v['src_vid_name'] for k, v in self.video_dict.items()])) all_pred = [] for vid in video_ids: this_dets = self.all_pred['nms'][self.all_pred['nms']['video-id'] == vid][['t-start', 't-end', 'score', 'cls']].values this_dets = apply_nms(this_dets)[:200, ...] this_dets = [[vid] + x.tolist() for x in this_dets] all_pred += this_dets self.all_pred['nms'] = pd.DataFrame(all_pred, columns=["video-id", "t-start", "t-end", "score", "cls"]) def cross_window_fusion(self): ''' merge detections in the overlapped regions of adjacent windows. Only used for THUMOS14 ''' # video_ids = list(set([v['src_vid_name'] for k, v in self.video_dict.items()])) all_pred = [] video_ids = self.all_pred['raw']['video-id'].unique() vid = video_ids[0] for vid in video_ids: this_dets = self.all_pred['raw'][self.all_pred['raw']['video-id'] == vid] slice_ids = this_dets['slice-id'].unique().tolist() if len(slice_ids) > 1: slice_sorted = sorted(slice_ids, key=lambda k: int(k.split('_')[4])) overlap_region_time_list = [] for i in range(0, len(slice_sorted) - 1): slice_name = slice_sorted[i] feature_fps = self.video_dict[slice_name]['feature_fps'] time_base = 0 # self.video_dict[slice_name]['time_base'] # parse the temporal coordinate from name cur_slice = [int(x) for x in slice_sorted[i].split('_')[4:6]] next_slice = [int(x) for x in slice_sorted[i+1].split('_')[4:6]] overlap_region_time = [next_slice[0], cur_slice[1]] # add time offset of each window/slice overlap_region_time = [time_base + overlap_region_time[iii] / feature_fps for iii in range(2)] overlap_region_time_list.append(overlap_region_time) mask_union = None processed_dets = [] for overlap_region_time in overlap_region_time_list: inters = np.minimum(this_dets['t-end'], overlap_region_time[1]) - np.maximum(this_dets['t-start'], overlap_region_time[0]) # we only perform NMS to the overlapped regions mask = inters > 0 overlap_dets = this_dets[mask] overlap_dets_arr = overlap_dets[['t-start', 't-end', 'score', 'cls']].values if len(overlap_dets) > 0: kept_dets_arr = apply_nms(np.concatenate((overlap_dets_arr, np.arange(len(overlap_dets_arr))[:, None]), axis=1)) processed_dets.append(overlap_dets.iloc[kept_dets_arr[:, -1].astype('int64')]) if mask_union is not None: mask_union = mask_union | mask else: mask_union = mask # instances not in overlapped region processed_dets.append(this_dets[~mask_union]) all_pred += processed_dets else: all_pred.append(this_dets) all_pred = pd.concat(all_pred) self.all_pred['raw'] = all_pred def accumulate(self, test_slice_overlap=0): '''accumulate detections in all videos''' for nms_mode in self.nms_mode: self.all_pred[nms_mode] = pd.DataFrame(self.all_pred[nms_mode], columns=["video-id", "slice-id", "t-start", "t-end", "score", "cls"]) self.pred_by_cls = {} for nms_mode in self.nms_mode: if self.dataset_name == 'thumos14' and nms_mode == 'raw' and test_slice_overlap > 0: self.cross_window_fusion() # if you really want to use NMS if self.dataset_name == 'thumos14' and nms_mode == 'nms' and test_slice_overlap > 0: self.nms_whole_dataset() self.pred_by_cls[nms_mode] = [self.all_pred[nms_mode][self.all_pred[nms_mode].cls == cls].reset_index(drop=True).drop('cls', 1) for cls in range(self.num_classes)] def import_prediction(self): pass def format_arr(self, arr, format='{:.2f}'): line = ' '.join([format.format(x) for x in arr]) return line def synchronize_between_processes(self): mode = self.nms_mode[0] print( len(self.all_pred[mode]), len({x[0] for x in self.all_pred[mode]}) ) self.all_pred = merge_distributed(self.all_pred) def summarize(self): '''Compute mAP and collect stats''' if self.dataset_name in ['thumos14', 'muses']: # 0.3~0.7 avg display_iou_thr_inds = [0, 1, 2, 3, 4] else: # 0.5 0.75 0.95 avg display_iou_thr_inds = [0, 5, 9] for nms_mode in self.nms_mode: logging.info( 'mode={} {} predictions from {} videos'.format( nms_mode, len(self.all_pred[nms_mode]), len(self.all_pred[nms_mode]['video-id'].unique())) ) header = ' '.join('%.2f' % self.iou_range[i] for i in display_iou_thr_inds) + ' avg' # 0 5 9 lines = [] for nms_mode in self.nms_mode: per_iou_ap = self.compute_map(nms_mode) line = ' '.join(['%.2f' % (100*per_iou_ap[i]) for i in display_iou_thr_inds]) + ' %.2f' % (100*per_iou_ap.mean()) + ' {} epoch{}'.format(nms_mode, self.epoch) lines.append(line) msg = header for l in lines: msg += '\n' + l logging.info('\n' + msg) for nms_mode in self.nms_mode: if self.dataset_name == 'thumos14': self.stats[nms_mode]['AP50'] = self.stats[nms_mode]['per_iou_ap'][2] else: self.stats[nms_mode]['AP50'] = self.stats[nms_mode]['per_iou_ap'][0] self.stats_summary = msg def compute_map(self, nms_mode): '''Compute mean average precision''' start_time = time.time() gt_by_cls, pred_by_cls = self.gt_by_cls, self.pred_by_cls[nms_mode] iou_range = self.iou_range num_classes = self.num_classes ap_values = np.zeros((num_classes, len(iou_range))) with concurrent.futures.ProcessPoolExecutor(min(self.num_workers, 8)) as p: futures = [] for cls in range(len(pred_by_cls)): if len(gt_by_cls[cls]) == 0: logging.info('no gt for class {}'.format(self.classes[cls])) if len(pred_by_cls[cls]) == 0: logging.info('no prediction for class {}'.format(self.classes[cls])) futures.append(p.submit(eval_ap, iou_range, cls, gt_by_cls[cls], pred_by_cls[cls])) for f in concurrent.futures.as_completed(futures): x = f.result() ap_values[x[0], :] = x[1] per_iou_ap = ap_values.mean(axis=0) per_cls_ap = ap_values.mean(axis=1) mAP = per_cls_ap.mean() self.stats[nms_mode]['mAP'] = mAP self.stats[nms_mode]['ap_values'] = ap_values self.stats[nms_mode]['per_iou_ap'] = per_iou_ap self.stats[nms_mode]['per_cls_ap'] = per_cls_ap return per_iou_ap def dump_to_json(self, dets, save_path): result_dict = {} videos = dets['video-id'].unique() for video in videos: this_detections = dets[dets['video-id'] == video] det_list = [] for idx, row in this_detections.iterrows(): det_list.append( {'segment': [float(row['t-start']), float(row['t-end'])], 'label': self.classes[int(row['cls'])], 'score': float(row['score'])} ) video_id = video[2:] if video.startswith('v_') else video result_dict[video_id] = det_list # the standard detection format for ActivityNet output_dict={ "version": "VERSION 1.3", "results": result_dict, "external_data":{}} if save_path: dirname = osp.dirname(save_path) if not osp.exists(dirname): os.makedirs(dirname) with open(save_path, 'w') as f: json.dump(output_dict, f) # return output_dict def dump_detection(self, save_path=None): for nms_mode in self.nms_mode: logging.info( 'dump detection result in JSON format to {}'.format(save_path.format(nms_mode))) self.dump_to_json(self.all_pred[nms_mode], save_path.format(nms_mode)) def merge_distributed(all_pred): '''gather outputs from different nodes at distributed mode''' all_pred_gathered = all_gather(all_pred) merged_all_pred = {k: [] for k in all_pred} for p in all_pred_gathered: for k in p: merged_all_pred[k] += p[k] return merged_all_pred if __name__ == '__main__': pass ================================================ FILE: demo.py ================================================ # ------------------------------------------------------------------------ # TadTR: End-to-end Temporal Action Detection with Transformer # Copyright (c) 2021. Xiaolong Liu. # ------------------------------------------------------------------------ from models import build_model from opts import update_cfg_from_file from util.misc import NestedTensor import torch import time import pdb # @torch.no_grad() def demo(args, cfg): device = torch.device(args.device) model, _, _ = build_model(cfg) bs, t = 1, 100 x = torch.rand([bs, cfg.feature_dim, t]).to(device) mask = torch.ones([bs, t], dtype=torch.bool).to(device) samples = NestedTensor(x, mask) targets = [ { 'labels': torch.LongTensor([0, 0]).to(device), 'segments': torch.FloatTensor([[0.5, 0.2], [0.7, 0.3]]).to(device), 'orig_size': 100.0 } for i in range(bs)] model.to(device) outputs = model(samples) # orig_target_sizes = torch.FloatTensor( # [t["orig_size"] for t in targets]).cuda() # results = postprocessor(outputs, orig_target_sizes) print('Passed') if __name__ == '__main__': from opts import get_args_parser, cfg, update_cfg_with_args args = get_args_parser().parse_args() if args.cfg: update_cfg_from_file(cfg, args.cfg) update_cfg_with_args(cfg, args.opt) if cfg.disable_cuda: cfg.act_reg = False demo(args, cfg) ================================================ FILE: docs/1_train_on_your_dataset.md ================================================ # Train and Evaluate TadTR on Your Dataset TODO ================================================ FILE: engine.py ================================================ # ------------------------------------------------------------------------ # TadTR: End-to-end Temporal Action Detection with Transformer # ------------------------------------------------------------------------ # Modified from DETR (https://github.com/facebookresearch/detr) # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved # ------------------------------------------------------------------------ """ Train and eval functions used in main.py """ import math import os.path as osp import sys from typing import Iterable import tqdm import logging import torch import util.misc as utils from datasets.tad_eval import TADEvaluator import pickle def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, data_loader: Iterable, optimizer: torch.optim.Optimizer, device: torch.device, epoch: int, cfg, max_norm: float = 0): model.train() criterion.train() metric_logger = utils.MetricLogger(delimiter=" ") metric_logger.add_meter('lr', utils.SmoothedValue( window_size=1, fmt='{value:.6f}')) header = 'Epoch: [{}]'.format(epoch) print_freq = 20 cnt = 0 for samples, targets in metric_logger.log_every(data_loader, print_freq, header): samples = samples.to(device) targets = [{k: v.to(device) if k in ['segments', 'labels'] else v for k, v in t.items()} for t in targets] outputs = model((samples.tensors, samples.mask)) loss_dict = criterion(outputs, targets) weight_dict = criterion.weight_dict losses = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) # reduce losses over all GPUs for logging purposes loss_dict_reduced = utils.reduce_dict(loss_dict) # loss of each type loss_dict_reduced_unscaled = {f'{k}_unscaled': v for k, v in loss_dict_reduced.items()} # weighted_loss of each type loss_dict_reduced_scaled = {k: v * weight_dict[k] for k, v in loss_dict_reduced.items() if k in weight_dict} losses_reduced_scaled = sum(loss_dict_reduced_scaled.values()) loss_value = losses_reduced_scaled.item() if not math.isfinite(loss_value): logging.info("Loss is {}, stopping training".format(loss_value)) logging.info(str(loss_dict_reduced)) sys.exit(1) losses.backward() if (cnt + 1) % cfg.iter_size == 0: # scale gradients when iter size is functioning if cfg.iter_size != 1: for g in optimizer.param_groups: for p in g['params']: p.grad /= cfg.iter_size if max_norm > 0: torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) optimizer.step() optimizer.zero_grad() metric_logger.update( loss=loss_value, **loss_dict_reduced_scaled, **loss_dict_reduced_unscaled) metric_logger.update(lr=optimizer.param_groups[0]["lr"]) cnt += 1 optimizer.zero_grad() # gather the stats from all processes metric_logger.synchronize_between_processes() logging.info(f"Averaged stats:{metric_logger}") return {k: meter.global_avg for k, meter in metric_logger.meters.items()} def to_device(t, device): if isinstance(t, (list, tuple)): return t else: return t.to(device) @torch.no_grad() def test(model, criterion, postprocessor, data_loader, base_ds, device, output_dir, cfg, subset='val', epoch=None, test_mode=False): ''' Run inference and evaluation. Do not compute loss test_mode: indicates that we are evaluating specific epoch during testing ''' model.eval() criterion.eval() metric_logger = utils.MetricLogger(delimiter=" ") metric_logger.add_meter('class_error', utils.SmoothedValue( window_size=1, fmt='{value:.2f}')) iou_range = [0.3, 0.4, 0.5, 0.6, 0.7] if cfg.dataset_name == 'thumos14' else [ num/100 for num in range(50, 100, 5)] # logging.info('iou range {}'.format(iou_range)) # action_evaluator = None action_evaluator = TADEvaluator(cfg.dataset_name, subset, base_ds, nms_mode=[ 'raw'], iou_range=iou_range, epoch=epoch) # raw_res = [] cnt = 0 for (samples, targets) in tqdm.tqdm(data_loader, total=len(data_loader)): samples = samples.to(device) outputs = model((samples.tensors, samples.mask)) # raw_res.append((outputs, targets)) video_duration = torch.FloatTensor( [t["video_duration"] for t in targets]).to(device) results = postprocessor(outputs, video_duration, fuse_score=cfg.act_reg) res = {target['video_id']: output for target, output in zip(targets, results)} if action_evaluator is not None: action_evaluator.update(res, assign_cls_labels=cfg.binary) # if cnt >= 9: # break cnt += 1 # accumulate predictions from all videos if action_evaluator is not None: action_evaluator.synchronize_between_processes() action_evaluator.accumulate(cfg.test_slice_overlap) # dump detections if test_mode: save_path = osp.join('outputs', 'detection_{}.json') action_evaluator.dump_detection(save_path) action_evaluator.summarize() stats = {} if action_evaluator is not None: for k, v in action_evaluator.stats.items(): for vk, vv in v.items(): stats[vk + '_' + k] = vv mAP_values = ' '.join([f'{k}: {100*v:.2f}'.format(k, v) for k, v in stats.items() if k.startswith('mAP')]) logging.info(mAP_values) stats['stats_summary'] = action_evaluator.stats_summary # with open('raw_outputs.pkl', 'wb') as f: # pickle.dump(raw_res, f) return stats ================================================ FILE: main.py ================================================ # ------------------------------------------------------------------------ # TadTR: End-to-end Temporal Action Detection with Transformer # Copyright (c) 2021 - 2012. Xiaolong Liu # ------------------------------------------------------------------------ # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 # ------------------------------------------------------------------------ # and DETR (https://github.com/facebookresearch/detr) # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved # ------------------------------------------------------------------------ '''Entry for training and testing''' import datetime import json import random import time from pathlib import Path import re import os import logging import sys import os.path as osp import numpy as np import torch from torch.utils.data import DataLoader, DistributedSampler from opts import get_args_parser, cfg, update_cfg_with_args, update_cfg_from_file import util.misc as utils from datasets import build_dataset from engine import train_one_epoch, test from models import build_model if cfg.tensorboard: from torch.utils.tensorboard import SummaryWriter def main(args): from util.logger import setup_logger if args.cfg is not None: update_cfg_from_file(cfg, args.cfg) update_cfg_with_args(cfg, args.opt) if cfg.output_dir: Path(cfg.output_dir).mkdir(parents=True, exist_ok=True) # The actionness regression module requires CUDA support # If your machine does not have CUDA enabled, this module will be disabled. if cfg.disable_cuda: cfg.act_reg = False utils.init_distributed_mode(args) if not args.eval: mode = 'train' else: mode = 'test' # Logs will be saved in log_path log_path = os.path.join(cfg.output_dir, mode + '.log') setup_logger(log_path) logging.info("git:\n {}\n".format(utils.get_sha())) logging.info(' '.join(sys.argv)) with open(osp.join(cfg.output_dir, mode + '_cmd.txt'), 'w') as f: f.write(' '.join(sys.argv) + '\n') logging.info(str(args)) logging.info(str(cfg)) device = torch.device(args.device) # fix the seed seed = args.seed + utils.get_rank() torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) torch.cuda.manual_seed(seed) torch.cuda.manual_seed_all(seed) if cfg.input_type == 'image': # We plan to support image input in the future raise NotImplementedError model, criterion, postprocessors = build_model(cfg) model.to(device) model_without_ddp = model if args.distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu], find_unused_parameters=True) model_without_ddp = model.module elif args.multi_gpu: model = torch.nn.DataParallel(model) model_without_ddp = model.module n_parameters = sum(p.numel() for p in model.parameters()) logging.info('number of params: {}'.format(n_parameters)) def match_name_keywords(n, name_keywords): out = False for b in name_keywords: if b in n: out = True break return out param_dicts = [ # non-backbone, non-offset { "params": [p for n, p in model_without_ddp.named_parameters() if not match_name_keywords(n, cfg.lr_backbone_names) and not match_name_keywords(n, cfg.lr_linear_proj_names) and p.requires_grad], "lr": cfg.lr, "initial_lr": cfg.lr }, # backbone { "params": [p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, cfg.lr_backbone_names) and p.requires_grad], "lr": cfg.lr_backbone, "initial_lr": cfg.lr_backbone }, # offset { "params": [p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, cfg.lr_linear_proj_names) and p.requires_grad], "lr": cfg.lr * cfg.lr_linear_proj_mult, "initial_lr": cfg.lr * cfg.lr_linear_proj_mult } ] optimizer = torch.optim.__dict__[cfg.optimizer](param_dicts, lr=cfg.lr, weight_decay=cfg.weight_decay) output_dir = Path(cfg.output_dir) if args.resume == 'latest': args.resume = osp.join(cfg.output_dir, 'checkpoint.pth') elif args.resume == 'best': args.resume = osp.join(cfg.output_dir, 'model_best.pth') if 'model_best.pth' in os.listdir(cfg.output_dir) and not args.resume and not args.eval: # for many times, my trained models were accidentally overwrittern by new models😂. So I add this to avoid that logging.error( 'Danger! You are overwriting an existing output dir {}, probably because you forget to change the output_dir option'.format(cfg.output_dir)) confirm = input('confirm: y/n') if confirm != 'y': return last_epoch = -1 if args.resume: checkpoint = torch.load(args.resume, map_location='cpu') last_epoch = checkpoint['epoch'] lr_scheduler = torch.optim.lr_scheduler.MultiStepLR( optimizer, cfg.lr_step, last_epoch=last_epoch) dataset_val = build_dataset(subset=cfg.test_set, args=cfg, mode='val') if not args.eval: dataset_train = build_dataset(subset='train', args=cfg, mode='train') if args.distributed: if not args.eval: sampler_train = DistributedSampler(dataset_train) sampler_val = DistributedSampler(dataset_val, shuffle=False) else: if not args.eval: sampler_train = torch.utils.data.RandomSampler(dataset_train) sampler_val = torch.utils.data.SequentialSampler(dataset_val) if not args.eval: batch_sampler_train = torch.utils.data.BatchSampler( sampler_train, cfg.batch_size, drop_last=True) data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train, collate_fn=utils.collate_fn, num_workers=args.num_workers, pin_memory=True) data_loader_val = DataLoader(dataset_val, cfg.batch_size, sampler=sampler_val, drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers, pin_memory=True) base_ds = dataset_val.video_dict if not args.eval and cfg.tensorboard and utils.is_main_process(): smry_writer = SummaryWriter(output_dir) else: smry_writer = None best_metric = -1 best_metric_txt = '' if args.eval and not args.resume: args.resume = osp.join(output_dir, 'model_best.pth') # start training from this epoch. You do not to set this option. start_epoch = 0 if args.resume: print('loading checkpint {}'.format(args.resume)) if args.resume.startswith('https'): checkpoint = torch.hub.load_state_dict_from_url( args.resume, map_location='cpu', check_hash=True) else: checkpoint = torch.load(args.resume, map_location='cpu') model_without_ddp.load_state_dict(checkpoint['model'], strict=False) if 'epoch' in checkpoint: start_epoch = checkpoint['epoch'] + 1 if 'best_metric' in checkpoint: best_metric = checkpoint['best_metric'] if args.eval: test_stats = test(model, criterion, postprocessors, data_loader_val, base_ds, device, cfg.output_dir, cfg, subset=cfg.test_set, epoch=checkpoint['epoch'], test_mode=True) return logging.info("Start training") start_time = time.time() for epoch in range(start_epoch, cfg.epochs): if args.distributed: sampler_train.set_epoch(epoch) for group in optimizer.param_groups: logging.info('lr={}'.format(group['lr'])) train_stats = train_one_epoch( model, criterion, data_loader_train, optimizer, device, epoch, cfg, cfg.clip_max_norm) lr_scheduler.step() if cfg.output_dir: # save checkpoint every `cfg.ckpt_interval` epochs, also when reducing the learning rate checkpoint_paths = [output_dir / 'checkpoint.pth'] if (epoch + 1) in cfg.lr_step or (epoch + 1) % cfg.ckpt_interval == 0: checkpoint_paths.append( output_dir / f'checkpoint{epoch:04}.pth') ckpt = { 'model': model_without_ddp.state_dict(), 'epoch': epoch, 'args': args, 'cfg': cfg, 'best_metric': best_metric, } for checkpoint_path in checkpoint_paths: utils.save_on_master(ckpt, checkpoint_path) if (epoch + 1) % cfg.test_interval == 0: test_stats = test( model, criterion, postprocessors, data_loader_val, base_ds, device, cfg.output_dir, cfg, epoch=epoch ) prime_metric = 'mAP_raw' if test_stats[prime_metric] > best_metric: best_metric = test_stats[prime_metric] best_metric_txt = test_stats['stats_summary'] logging.info( 'new best metric {:.4f}@epoch{}'.format(best_metric, epoch)) if cfg.output_dir: ckpt['best_metric'] = best_metric best_ckpt_path = output_dir / 'model_best.pth' utils.save_on_master(ckpt, best_ckpt_path) else: test_stats = {} log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, **{f'test_{k}': v for k, v in test_stats.items()}, 'epoch': epoch, 'n_parameters': n_parameters} if cfg.output_dir and utils.is_main_process(): for k, v in log_stats.items(): if isinstance(v, np.ndarray): log_stats[k] = v.tolist() with (output_dir / "log.txt").open("a") as f: f.write(json.dumps(log_stats) + "\n") if smry_writer: for k, v in log_stats.items(): if re.findall('loss_\S+unscaled', k) or k.endswith('loss') or 'lr' in k or 'AP50' in k or 'AP75' in k or 'AP95' in k or 'mAP' in k or 'AR' in k: smry_writer.add_scalar(k, v, epoch) total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) if utils.is_main_process(): logging.info('Training time {}'.format(total_time_str)) logging.info(str( ['{}:{}'.format(k, v) for k, v in test_stats.items() if 'AP' in k or 'AR' in k])) if smry_writer is not None: smry_writer.close() logging.info('best det result\n{}'.format(best_metric_txt)) logging.info(log_path) if __name__ == '__main__': import argparse parser = argparse.ArgumentParser( 'TadTR training and evaluation script', parents=[get_args_parser()]) args = parser.parse_args() s_ = time.time() main(args) logging.info('main takes {:.3f} seconds'.format(time.time() - s_)) ================================================ FILE: models/__init__.py ================================================ # ------------------------------------------------------------------------ # TadTR: End-to-end Temporal Action Detection with Transformer # Copyright (c) 2021. Xiaolong Liu. # ------------------------------------------------------------------------ '''build models''' from .tadtr import build def build_model(args): return build(args) ================================================ FILE: models/custom_loss.py ================================================ # Mostly copied from DETR (https://github.com/facebookresearch/detr) # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved '''Focal loss implementation''' import torch import torch.nn.functional as F def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): """ Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. Args: inputs: A float tensor of arbitrary shape. The predictions for each example. targets: A float tensor with the same shape as inputs. Stores the binary classification label for each element in inputs (0 for the negative class and 1 for the positive class). alpha: (optional) Weighting factor in range (0,1) to balance positive vs negative examples. Default = -1 (no weighting). gamma: Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples. Returns: Loss tensor """ prob = inputs.sigmoid() ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") p_t = prob * targets + (1 - prob) * (1 - targets) loss = ce_loss * ((1 - p_t) ** gamma) if alpha >= 0: alpha_t = alpha * targets + (1 - alpha) * (1 - targets) loss = alpha_t * loss return loss.mean(1).sum() / num_boxes if __name__ == "__main__": import numpy as np pred = torch.from_numpy(np.random.random([8, 2])) target = torch.from_numpy(np.random.random(8) > 0.5).long() loss = sigmoid_focal_loss(pred, target) ================================================ FILE: models/matcher.py ================================================ # ------------------------------------------------------------------------ # TadTR: End-to-end Temporal Action Detection with Transformer # ------------------------------------------------------------------------ # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) # Copyright (c) 2020 SenseTime. All Rights Reserved. # ------------------------------------------------------------------------ # Modified from DETR (https://github.com/facebookresearch/detr) # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. # ------------------------------------------------------------------------ # Modified from DETR (https://github.com/facebookresearch/detr) # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch from scipy.optimize import linear_sum_assignment from torch import nn from util.segment_ops import segment_cw_to_t1t2, segment_iou import pdb class HungarianMatcher(nn.Module): """This class computes an assignment between the targets and the predictions of the network For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are un-matched (and thus treated as non-objects). """ def __init__(self, cost_class: float = 1, cost_seg: float = 1, cost_iou: float = 1): """Creates the matcher Params: cost_class: This is the relative weight of the classification error in the matching cost cost_seg: This is the relative weight of the L1 error of the segment coordinates in the matching cost cost_iou: This is the relative weight of the iou loss of the segment in the matching cost """ super().__init__() self.cost_class = cost_class self.cost_seg = cost_seg self.cost_iou = cost_iou assert cost_class != 0 or cost_seg!= 0 or cost_iou != 0, "all costs cant be 0" @torch.no_grad() def forward(self, outputs, targets): """ Performs the matching Params: outputs: This is a dict that contains at least these entries: "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits "pred_segments": Tensor of dim [batch_size, num_queries, 2] with the predicted segment coordinates targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: "labels": Tensor of dim [num_target_segments] (where num_target_segments is the number of ground-truth objects in the target) containing the class labels "segments": Tensor of dim [num_target_segments, 2] containing the target segment coordinates Returns: A list of size batch_size, containing tuples of (index_i, index_j) where: - index_i is the indices of the selected predictions (in order) - index_j is the indices of the corresponding selected targets (in order) For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_segments) """ bs, num_queries = outputs["pred_logits"].shape[:2] # We flatten to compute the cost matrices in a batch out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes] out_seg = outputs["pred_segments"].flatten(0, 1) # [batch_size * num_queries, 2] # Also concat the target labels and segments tgt_ids = torch.cat([v["labels"] for v in targets]) # shape = n1+n2+... tgt_seg = torch.cat([v["segments"] for v in targets]) # Compute the classification cost. alpha = 0.25 gamma = 2.0 neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log()) pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids] # Compute the L1 cost between segments cost_seg = torch.cdist(out_seg, tgt_seg, p=1) # Compute the iou cost betwen segments cost_iou = -segment_iou(segment_cw_to_t1t2(out_seg), segment_cw_to_t1t2(tgt_seg)) # Final cost matrix, [bs x nq, batch_ngt] C = self.cost_seg * cost_seg + self.cost_class * cost_class + self.cost_iou * cost_iou C = C.view(bs, num_queries, -1).cpu() sizes = [len(v["segments"]) for v in targets] indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))] return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] def build_matcher(args): return HungarianMatcher(cost_class=args.set_cost_class, cost_seg=args.set_cost_seg, cost_iou=args.set_cost_iou) ================================================ FILE: models/ops/roi_align/__init__.py ================================================ from .roi_align import ROIAlign # __all__ = ['roi_pool', 'ROIAlign'] ================================================ FILE: models/ops/roi_align/roi_align.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. import torch from torch import nn from torch.autograd import Function from torch.autograd.function import once_differentiable from torch.nn.modules.utils import _pair from . import Align1D as _align_1d class _Align1D(Function): @staticmethod def forward(ctx, input, roi, feature_dim, ratio): ctx.save_for_backward(roi) ctx.feature_dim = feature_dim ctx.input_shape = input.size() ctx.sampling_ratio = ratio output = _align_1d.forward( input, roi, feature_dim, ratio ) return output @staticmethod @once_differentiable def backward(ctx, grad_output): rois, = ctx.saved_tensors feature_dim = ctx.feature_dim bs, ch, t = ctx.input_shape ratio = ctx.sampling_ratio grad_input = _align_1d.backward( grad_output, rois, feature_dim, bs, ch, t, ratio ) return grad_input, None, None, None, None align1d = _Align1D.apply class ROIAlign(nn.Module): def __init__(self, feature_dim, ratio=0): super(ROIAlign, self).__init__() self.feature_dim = feature_dim self.ratio = ratio def forward(self, input, rois): # print('- input shape is', input.shape) # print('- input mean is', input.mean()) # print('- rois shape is', rois.shape) # print('- rois is on', rois.get_device()) assert input.device==rois.device, 'Align operation requires ' + \ 'both feature and roi are on the same device! ' + \ 'Get feature on {} but roi on {}'.format(input.device,rois.device) out = align1d(input, rois, self.feature_dim, self.ratio) # print('- output shape is', out.shape) # print('- output mean is', out.mean()) return out def __repr__(self): tmpstr = self.__class__.__name__ + "(" tmpstr += "feature_dim=" + str(self.feature_dim) tmpstr += "sampling_ratio=" + str(self.ratio) tmpstr += ")" return tmpstr if __name__ == "__main__": layer = Align1DLayer(16) # layer = torch.nn.DataParallel(layer, device_ids=[0,1]) input = torch.tensor([[[1.,2,3,4,5,6,7,8,9,10],[11,12,13,14,15,16,17,18,19,20]]]).cuda() proposal = torch.tensor([[0,-0.5,9.5],[0,0.1,0.9]]).cuda() print(input.shape, proposal.shape) output = layer(input, proposal) print("output has shape {}, with mean {}".format(output.shape, torch.mean(output))) print(output) ================================================ FILE: models/ops/roi_align/src/roi_align_cuda.cpp ================================================ #include #include // CUDA forward declarations at::Tensor Align_forward_cuda(const at::Tensor& input, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int sampling_ratio); at::Tensor Align_backward_cuda(const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int batch_size, const int channels, const int height, const int sampling_ratio); // C++ interface at::Tensor Align_forward(const at::Tensor& input, // (bs,ch,t) const at::Tensor& rois, // (bs, start, end) const int pooled_height, const int sampling_ratio){ return Align_forward_cuda( input, rois, 1.0, pooled_height, sampling_ratio); } at::Tensor Align_backward(const at::Tensor& grad, const at::Tensor& rois, const int pooled_height, const int batch_size, const int channels, const int height, const int sampling_ratio){ return Align_backward_cuda(grad, rois, 1.0, pooled_height, batch_size, channels, height, sampling_ratio); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("forward", &Align_forward, "Align forward (CUDA)"); m.def("backward", &Align_backward, "Align backward (CUDA)"); } ================================================ FILE: models/ops/roi_align/src/roi_align_kernel.cu ================================================ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. // Modifies by Frost for 1D ussage #include #include #include #include #include // TODO make it in a common file #define CUDA_1D_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ i += blockDim.x * gridDim.x) template __device__ T linear_interpolate(const T* bottom_data, const int height, T t, const int index /* index for debug only*/) { // deal with cases that inverse elements are out of feature map boundary if (t < -1.0 || t > height) { //empty return 0; } if (t <= 0) t = 0; int t_low = (int) t; int t_high; // get closest integers to t if (t_low >= height - 1) { t_high = t_low = height - 1; t = (T) t_low; } else { t_high = t_low + 1; } // get the distance to t T lt = t - t_low; T ht = 1. - lt; // do linear interpolation T v1 = bottom_data[t_low]; T v2 = bottom_data[t_high]; T w1 = ht, w2 = lt; T val = (w1 * v1 + w2 * v2); // printf("Check Linear Interpolate: w1=%f, v1=%f, w2=%f, v2=%f \n", w1, v1, w2, v2); return val; } template __global__ void Align1DForward(const int nthreads, const T* bottom_data, const T spatial_scale, const int channels, const int height, const int pooled_height, const int sampling_ratio, const T* bottom_rois, T* top_data) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, pt) is an element in the pooled output int pt = index % pooled_height; int c = (index / pooled_height) % channels; int n = index / pooled_height / channels; // printf("Debug Main Loop: get pt, c, n are %d, %d, %d \n", pt, c, n); const T* offset_bottom_rois = bottom_rois + n * 3; int roi_batch_ind = offset_bottom_rois[0]; // Do not using rounding; this implementation detail is critical T roi_start = offset_bottom_rois[1] * spatial_scale; T roi_end = offset_bottom_rois[2] * spatial_scale; // printf("Debug roi boundary: w1, w2, is %f, %f \n", roi_start,roi_end,); // Force malformed ROIs to be 1x1 T roi_height = max(roi_end- roi_start, (T)1.); T bin_size = static_cast(roi_height) / static_cast(pooled_height); const T* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height; // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 // We do average (integral) pooling inside a bin const T count = roi_bin_grid; // e.g. = 4 T output_val = 0.; for (int it = 0; it < roi_bin_grid; it ++) // e.g., it = 0, 1 { const T t = roi_start + pt * bin_size + static_cast(it + .5f) * bin_size / static_cast(roi_bin_grid); // e.g., 0.5, 1.5 T val = linear_interpolate(offset_bottom_data, height, t, index); // printf("Debug linear_interpolate: input=height:%d, t:%f, ... ; output=val:%f \n", height, t, val); output_val += val; } output_val /= count; top_data[index] = output_val; } } template __device__ void linear_interpolate_gradient( const int height, T t, T & w1, T & w2, int & t_low, int & t_high, const int index /* index for debug only*/) { // deal with cases that inverse elements are out of feature map boundary if (t < -1.0 || t > height) { //empty w1 = w2 = 0.; t_low = t_high = -1; return; } if (t <= 0) t = 0; t_low = (int) t; if (t_low >= height - 1) { t_high = t_low = height - 1; t = (T) t_low; } else { t_high = t_low + 1; } T lt = t - t_low; T ht = 1. - lt; // T val = (w1 * v1 + w2 * v2); // T w1 = ht, w2 = lt; w1 = ht , w2 = lt; return; } template __global__ void Align1DBackwardFeature(const int nthreads, const T* top_diff, const int num_rois, const T spatial_scale, const int channels, const int height, const int pooled_height, const int sampling_ratio, T* bottom_diff, const T* bottom_rois) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, pt) is an element in the pooled output int pt = (index ) % pooled_height; int c = (index / pooled_height) % channels; int n = index / pooled_height / channels; const T* offset_bottom_rois = bottom_rois + n * 3; int roi_batch_ind = offset_bottom_rois[0]; // Do not using rounding; this implementation detail is critical T roi_start= offset_bottom_rois[1] * spatial_scale; T roi_end= offset_bottom_rois[2] * spatial_scale; // Force malformed ROIs to be 1x1 T roi_height = max(roi_end- roi_start, (T)1.); T bin_size = static_cast(roi_height) / static_cast(pooled_height); T* offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height; int top_offset = (n * channels + c) * pooled_height; const T* offset_top_diff = top_diff + top_offset; const T top_diff_this_bin = offset_top_diff[pt]; // We use roi_bin_grid to sample the grid and mimic integral int roi_bin_grid= (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2 // We do average (integral) pooling inside a bin const T count = roi_bin_grid; // e.g. = 4 for (int it = 0; it < roi_bin_grid; it ++) // e.g., iy = 0, 1 { const T t = roi_start+ pt * bin_size+ static_cast(it + .5f) * bin_size/ static_cast(roi_bin_grid); // e.g., 0.5, 1.5 T w1, w2; int t_low, t_high; linear_interpolate_gradient(height, t, w1, w2, t_low, t_high, index); T g1 = top_diff_this_bin * w1 / count; T g2 = top_diff_this_bin * w2 / count; if (t_low >= 0 && t_high >= 0) { atomicAdd(offset_bottom_diff + t_low, static_cast(g1)); atomicAdd(offset_bottom_diff + t_high, static_cast(g2)); } // if } // it } // CUDA_1D_KERNEL_LOOP } // RoIAlignBackward at::Tensor Align_forward_cuda(const at::Tensor& input, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int sampling_ratio) { AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); auto num_rois = rois.size(0); auto channels = input.size(1); auto height = input.size(2); auto output = at::empty({num_rois, channels, pooled_height}, input.options()); auto output_size = num_rois * pooled_height * channels; cudaStream_t stream = at::cuda::getCurrentCUDAStream(); dim3 grid(std::min(THCCeilDiv((long)output_size, 512L), 4096L)); dim3 block(512); // printf("Debug main function: height:%d\n", height); if (output.numel() == 0) { THCudaCheck(cudaGetLastError()); return output; } AT_DISPATCH_FLOATING_TYPES(input.type(), "Align1D_forward", [&] { Align1DForward<<>>( output_size, input.contiguous().data(), spatial_scale, channels, height, pooled_height, sampling_ratio, rois.contiguous().data(), output.data()); }); THCudaCheck(cudaGetLastError()); return output; } // TODO remove the dependency on input and use instead its sizes -> save memory at::Tensor Align_backward_cuda(const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale, const int pooled_height, const int batch_size, const int channels, const int height, const int sampling_ratio) { AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor"); AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); auto num_rois = rois.size(0); auto grad_input = at::zeros({batch_size, channels, height}, grad.options()); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); dim3 grid(std::min(THCCeilDiv((long)grad.numel(), 512L), 4096L)); dim3 block(512); // handle possibly empty gradients if (grad.numel() == 0) { THCudaCheck(cudaGetLastError()); return grad_input; } AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIAlign_backward", [&] { Align1DBackwardFeature<<>>( grad.numel(), grad.contiguous().data(), num_rois, spatial_scale, channels, height, pooled_height, sampling_ratio, grad_input.data(), rois.contiguous().data()); }); THCudaCheck(cudaGetLastError()); return grad_input; } ================================================ FILE: models/ops/setup.py ================================================ # ------------------------------------------------------------------------ # TadTR: End-to-end Temporal Action Detection with Transformer # Copyright (c) 2021. Xiaolong Liu. # ------------------------------------------------------------------------ # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ import os import glob import pdb import torch from torch.utils.cpp_extension import CUDA_HOME from torch.utils.cpp_extension import CppExtension from torch.utils.cpp_extension import CUDAExtension from setuptools import find_packages from setuptools import setup requirements = ["torch", "torchvision"] def get_sources(extensions_dir): main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) return main_file + source_cpu + source_cuda def get_extensions(): this_dir = os.path.dirname(os.path.abspath(__file__)) extra_compile_args = {"cxx": []} define_macros = [] if torch.cuda.is_available() and CUDA_HOME is not None: define_macros += [("WITH_CUDA", None)] extra_compile_args["nvcc"] = [ "-DCUDA_HAS_FP16=1", "-D__CUDA_NO_HALF_OPERATORS__", "-D__CUDA_NO_HALF_CONVERSIONS__", "-D__CUDA_NO_HALF2_OPERATORS__", ] else: raise NotImplementedError('Cuda is not availabel') ext_modules = [ # Temporal Deformable Attention, optional # CUDAExtension( # "temporal_deform_attn.TemporalDeformableAttention", # get_sources(os.path.join(this_dir, "temporal_deform_attn/src")), # include_dirs=[os.path.join(this_dir, "temporal_deform_attn/src")], # define_macros=define_macros, # extra_compile_args=extra_compile_args # ), CUDAExtension('roi_align.Align1D', [ 'roi_align/src/roi_align_cuda.cpp', 'roi_align/src/roi_align_kernel.cu']) ] return ext_modules setup( name="TadTR_release", version="1.0", author="Xiaolong Liu", description="PyTorch Wrapper for CUDA Functions of TadTR", packages=find_packages(exclude=("configs", "tests",)), ext_modules=get_extensions(), cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, ) ================================================ FILE: models/ops/temporal_deform_attn/__init__.py ================================================ # ------------------------------------------------------------------------ # TadTR: End-to-end Temporal Action Detection with Transformer # Copyright (c) 2021. Xiaolong Liu. # ------------------------------------------------------------------------------------------------ # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ from .temporal_deform_attn import DeformAttn ================================================ FILE: models/ops/temporal_deform_attn/temporal_deform_attn.py ================================================ # ------------------------------------------------------------------------ # TadTR: End-to-end Temporal Action Detection with Transformer # Copyright (c) 2021. Xiaolong Liu. # ------------------------------------------------------------------------------------------------ # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 [see LICENSE for details] # ------------------------------------------------------------------------------------------------ # Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 # ------------------------------------------------------------------------------------------------ from __future__ import absolute_import from __future__ import print_function from __future__ import division from opts import cfg # if not cfg.disable_cuda: # from .functions import TDAFunction import warnings import math import pdb import torch from torch import nn import torch.nn.functional as F from torch.nn.init import xavier_uniform_, constant_ def _is_power_of_2(n): if (not isinstance(n, int)) or (n < 0): raise ValueError( "invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) return (n & (n-1) == 0) and n != 0 class DeformAttn(nn.Module): def __init__(self, d_model=256, n_levels=1, n_heads=8, n_points=4): """ Deformable Attention Module :param d_model hidden dimension :param n_levels number of feature levels :param n_heads number of attention heads :param n_points number of sampling points per attention head """ super().__init__() if d_model % n_heads != 0: raise ValueError( 'd_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) _d_per_head = d_model // n_heads # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation if not _is_power_of_2(_d_per_head): warnings.warn("You'd better set d_model in DeformAttn to make the dimension of each attention head a power of 2 " "which is more efficient in our CUDA implementation.") assert n_levels == 1, 'multi-level attention is not supported!' self.seq2col_step = 64 self.d_model = d_model self.n_levels = n_levels self.n_heads = n_heads self.n_points = n_points self.sampling_offsets = nn.Linear( d_model, n_heads * n_levels * n_points) self.attention_weights = nn.Linear( d_model, n_heads * n_levels * n_points) self.value_proj = nn.Linear(d_model, d_model) self.output_proj = nn.Linear(d_model, d_model) self._reset_parameters() def _reset_parameters(self): constant_(self.sampling_offsets.weight.data, 0.) # Initial offsets: # (1, 0, -1, 0, -1, 0, 1, 0) thetas = torch.arange( self.n_heads, dtype=torch.float32) * (4.0 * math.pi / self.n_heads) grid_init = thetas.cos()[:, None] grid_init = grid_init.view(self.n_heads, 1, 1, 1).repeat( 1, self.n_levels, self.n_points, 1) for i in range(self.n_points): grid_init[:, :, i, :] *= i + 1 with torch.no_grad(): self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) constant_(self.attention_weights.weight.data, 0.) constant_(self.attention_weights.bias.data, 0.) xavier_uniform_(self.value_proj.weight.data) constant_(self.value_proj.bias.data, 0.) xavier_uniform_(self.output_proj.weight.data) constant_(self.output_proj.bias.data, 0.) def forward(self, query, reference_points, input_flatten, input_temporal_lens, input_level_start_index, input_padding_mask=None): """ :param query (= src + pos) (N, Length_{query}, C) :param reference_points (N, Length_{query}, n_levels, 1), range in [0, 1], left (0), right (1), including padding area or (N, Length_{query}, n_levels, 2), add additional (t) to form reference segments :param input_flatten (=src) (N, \sum_{l=0}^{L-1} T_l, C) :param input_temporal_lens (n_levels), [T_0, T_1, ..., T_(L-1)] :param input_level_start_index (n_levels, ), [0, T_0, T_1, T_2, ..., T_{L-1}] :param input_padding_mask (N, \sum_{l=0}^{L-1} T_l), True for padding elements, False for non-padding elements :return output (N, Length_{query}, C) """ N, Len_q, _ = query.shape N, Len_in, _ = input_flatten.shape assert input_temporal_lens.sum() == Len_in value = self.value_proj(input_flatten) if input_padding_mask is not None: value = value.masked_fill(input_padding_mask[..., None], float(0)) value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) # the predicted offset in temporal axis. They are *absolute* values, not normalized sampling_offsets = self.sampling_offsets(query).view( N, Len_q, self.n_heads, self.n_levels, self.n_points, 1) attention_weights = self.attention_weights(query).view( N, Len_q, self.n_heads, self.n_levels * self.n_points) attention_weights = F.softmax( attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) if reference_points.shape[-1] == 1: # the reference points are normalized, but the offset are unnormalized # so we need to normalize the offsets offset_normalizer = input_temporal_lens[..., None] # (N, Length_{query}, n_heads, n_levels, n_points, 1) sampling_locations = reference_points[:, :, None, :, None, :] \ + sampling_offsets / \ offset_normalizer[None, None, None, :, None, :] # deform attention in the l-th (l >= 2) decoder layer when segment refinement is enabled elif reference_points.shape[-1] == 2: # offsets are related with the size of the reference segment sampling_locations = reference_points[:, :, None, :, None, :1] \ + sampling_offsets / self.n_points * \ reference_points[:, :, None, :, None, 1:] * 0.5 else: raise ValueError( 'Last dim of reference_points must be 1 or 2, but get {} instead.'.format(reference_points.shape[-1])) if cfg.dfm_att_backend == 'pytorch' or cfg.disable_cuda: # Implementation with PyTorch grid_sample operator. # Note that grid_sample only supports image inputs. We need to view the sequence as an image with height=1 sampling_locations = torch.cat((sampling_locations, torch.ones_like(sampling_locations)*0.5), dim=-1) input_spatial_shapes = torch.stack((torch.ones_like(input_temporal_lens), input_temporal_lens), dim=-1) output = deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) else: raise NotImplementedError # # CUDA implementation. You will get identical results with the pytorch implementation # output = TDAFunction.apply( # value, input_temporal_lens, input_level_start_index, sampling_locations, attention_weights, self.seq2col_step) output = self.output_proj(output) return output, (sampling_locations, attention_weights) def deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): '''deformable attention implemeted with grid_sample.''' N_, S_, M_, D_ = value.shape _, Lq_, M_, L_, P_, _ = sampling_locations.shape value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) sampling_grids = 2 * sampling_locations - 1 sampling_value_list = [] for lid_, (H_, W_) in enumerate(value_spatial_shapes): # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) # N_*M_, D_, Lq_, P_ sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, mode='bilinear', padding_mode='zeros', align_corners=False) sampling_value_list.append(sampling_value_l_) # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) return output.transpose(1, 2).contiguous() ================================================ FILE: models/position_encoding.py ================================================ # ------------------------------------------------------------------------ # TadTR: End-to-end Temporal Action Detection with Transformer # Copyright (c) 2021. Xiaolong Liu.# # ------------------------------------------------------------------------ # Modified from DETR (https://github.com/facebookresearch/detr) # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved # ------------------------------------------------------------------------ """ Positional encodings for the transformer. """ import math import torch from torch import nn from util.misc import NestedTensor class PositionEmbeddingSine(nn.Module): """ This is a more standard version of the position embedding, very similar to the one used by the Attention is all you need paper, generalized to work on videos. """ def __init__(self, num_pos_feats=256, temperature=10000, normalize=False, scale=None): super().__init__() self.num_pos_feats = num_pos_feats self.temperature = temperature self.normalize = normalize if scale is not None and normalize is False: raise ValueError("normalize should be True if scale is passed") if scale is None: scale = 2 * math.pi self.scale = scale def forward(self, tensor_list: NestedTensor): x = tensor_list.tensors mask = tensor_list.mask assert mask is not None not_mask = ~mask x_embed = not_mask.cumsum(1, dtype=torch.float32) # N x T if self.normalize: eps = 1e-6 x_embed = x_embed / (x_embed[:, -1:] + eps) * self.scale dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) pos_x = x_embed[:, :, None] / dim_t # N x T x C # n,c,t pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) pos = pos_x.permute(0, 2, 1) # N x C x T return pos def build_position_encoding(args): feat_dim = args.hidden_dim if args.position_embedding in ('v2', 'sine'): position_embedding = PositionEmbeddingSine(feat_dim, normalize=True) else: raise ValueError(f"not supported {args.position_embedding}") return position_embedding ================================================ FILE: models/tadtr.py ================================================ # ------------------------------------------------------------------------ # TadTR: End-to-end Temporal Action Detection with Transformer # Copyright (c) 2021. Xiaolong Liu. # ------------------------------------------------------------------------ # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 # ------------------------------------------------------------------------ # and DETR (https://github.com/facebookresearch/detr) # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved # ------------------------------------------------------------------------ """ TadTR model and criterion classes. """ import math import copy import torch import torch.nn.functional as F from torch import nn from util import segment_ops from util.misc import (NestedTensor, nested_tensor_from_tensor_list, accuracy, get_world_size, is_dist_avail_and_initialized, inverse_sigmoid) from models.matcher import build_matcher from models.position_encoding import build_position_encoding from .custom_loss import sigmoid_focal_loss from .transformer import build_deformable_transformer from opts import cfg if not cfg.disable_cuda: from models.ops.roi_align import ROIAlign def _get_clones(module, N): return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) def get_norm(norm_type, dim, num_groups=None): if norm_type == 'gn': assert num_groups is not None, 'num_groups must be specified' return nn.GroupNorm(num_groups, dim) elif norm_type == 'bn': return nn.BatchNorm1d(dim) else: raise NotImplementedError class TadTR(nn.Module): """ This is the TadTR module that performs temporal action detection """ def __init__(self, position_embedding, transformer, num_classes, num_queries, aux_loss=True, with_segment_refine=True, with_act_reg=True): """ Initializes the model. Parameters: backbone: torch module of the backbone to be used. See backbone.py transformer: torch module of the transformer architecture. See deformable_transformer.py num_classes: number of action classes num_queries: number of action queries, ie detection slot. This is the maximal number of actions TadTR can detect in a single video. aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used. with_segment_refine: iterative segment refinement """ super().__init__() self.num_queries = num_queries self.transformer = transformer hidden_dim = transformer.d_model self.class_embed = nn.Linear(hidden_dim, num_classes) self.segment_embed = MLP(hidden_dim, hidden_dim, 2, 3) self.query_embed = nn.Embedding(num_queries, hidden_dim*2) self.input_proj = nn.ModuleList([ nn.Sequential( nn.Conv1d(2048, hidden_dim, kernel_size=1), nn.GroupNorm(32, hidden_dim), )]) # self.backbone = backbone self.position_embedding = position_embedding self.aux_loss = aux_loss self.with_segment_refine = with_segment_refine self.with_act_reg = with_act_reg prior_prob = 0.01 bias_value = -math.log((1 - prior_prob) / prior_prob) self.class_embed.bias.data = torch.ones(num_classes) * bias_value nn.init.constant_(self.segment_embed.layers[-1].weight.data, 0) nn.init.constant_(self.segment_embed.layers[-1].bias.data, 0) for proj in self.input_proj: nn.init.xavier_uniform_(proj[0].weight, gain=1) nn.init.constant_(proj[0].bias, 0) num_pred = transformer.decoder.num_layers if with_segment_refine: self.class_embed = _get_clones(self.class_embed, num_pred) self.segment_embed = _get_clones(self.segment_embed, num_pred) nn.init.constant_( self.segment_embed[0].layers[-1].bias.data[1:], -2.0) # hack implementation for segment refinement self.transformer.decoder.segment_embed = self.segment_embed else: nn.init.constant_( self.segment_embed.layers[-1].bias.data[1:], -2.0) self.class_embed = nn.ModuleList( [self.class_embed for _ in range(num_pred)]) self.segment_embed = nn.ModuleList( [self.segment_embed for _ in range(num_pred)]) self.transformer.decoder.segment_embed = None if with_act_reg: # RoIAlign params self.roi_size = 16 self.roi_scale = 0 self.roi_extractor = ROIAlign(self.roi_size, self.roi_scale) self.actionness_pred = nn.Sequential( nn.Linear(self.roi_size * hidden_dim, hidden_dim), nn.ReLU(inplace=True), nn.Linear(hidden_dim, hidden_dim), nn.ReLU(inplace=True), nn.Linear(hidden_dim, 1), nn.Sigmoid() ) def _to_roi_align_format(self, rois, T, scale_factor=1): '''Convert RoIs to RoIAlign format. Params: RoIs: normalized segments coordinates, shape (batch_size, num_segments, 4) T: length of the video feature sequence ''' # transform to absolute axis B, N = rois.shape[:2] rois_center = rois[:, :, 0:1] rois_size = rois[:, :, 1:2] * scale_factor rois_abs = torch.cat( (rois_center - rois_size/2, rois_center + rois_size/2), dim=2) * T # expand the RoIs rois_abs = torch.clamp(rois_abs, min=0, max=T) # (N, T, 2) # add batch index batch_ind = torch.arange(0, B).view((B, 1, 1)).to(rois_abs.device) batch_ind = batch_ind.repeat(1, N, 1) rois_abs = torch.cat((batch_ind, rois_abs), dim=2) # NOTE: stop gradient here to stablize training return rois_abs.view((B*N, 3)).detach() def forward(self, samples): """ The forward expects a NestedTensor, which consists of: - samples.tensors: batched images, of shape [batch_size x 3 x H x W] - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels or a tuple of tensors and mask It returns a dict with the following elements: - "pred_logits": the classification logits (including no-action) for all queries. Shape= [batch_size x num_queries x (num_classes + 1)] - "pred_segments": The normalized segments coordinates for all queries, represented as (center, width). These values are normalized in [0, 1], relative to the size of each individual image (disregarding possible padding). See PostProcess for information on how to retrieve the unnormalized segment. - "aux_outputs": Optional, only returned when auxilary losses are activated. It is a list of dictionnaries containing the two above keys for each decoder layer. """ if not isinstance(samples, NestedTensor): if isinstance(samples, (list, tuple)): samples = NestedTensor(*samples) else: samples = nested_tensor_from_tensor_list(samples) # (n, c, t) pos = [self.position_embedding(samples)] src, mask = samples.tensors, samples.mask srcs = [self.input_proj[0](src)] masks = [mask] query_embeds = self.query_embed.weight hs, init_reference, inter_references, memory = self.transformer( srcs, masks, pos, query_embeds) outputs_classes = [] outputs_coords = [] # gather outputs from each decoder layer for lvl in range(hs.shape[0]): if lvl == 0: reference = init_reference else: reference = inter_references[lvl - 1] reference = inverse_sigmoid(reference) outputs_class = self.class_embed[lvl](hs[lvl]) tmp = self.segment_embed[lvl](hs[lvl]) # the l-th layer (l >= 2) if reference.shape[-1] == 2: tmp += reference # the first layer else: assert reference.shape[-1] == 1 tmp[..., 0] += reference[..., 0] outputs_coord = tmp.sigmoid() outputs_classes.append(outputs_class) outputs_coords.append(outputs_coord) outputs_class = torch.stack(outputs_classes) outputs_coord = torch.stack(outputs_coords) if not self.with_act_reg: out = {'pred_logits': outputs_class[-1], 'pred_segments': outputs_coord[-1]} else: # perform RoIAlign B, N = outputs_coord[-1].shape[:2] origin_feat = memory rois = self._to_roi_align_format( outputs_coord[-1], origin_feat.shape[2], scale_factor=1.5) roi_features = self.roi_extractor(origin_feat, rois) roi_features = roi_features.view((B, N, -1)) pred_actionness = self.actionness_pred(roi_features) last_layer_cls = outputs_class[-1] last_layer_reg = outputs_coord[-1] out = {'pred_logits': last_layer_cls, 'pred_segments': last_layer_reg, 'pred_actionness': pred_actionness} if self.aux_loss: out['aux_outputs'] = self._set_aux_loss( outputs_class, outputs_coord) return out @torch.jit.unused def _set_aux_loss(self, outputs_class, outputs_coord): # this is a workaround to make torchscript happy, as torchscript # doesn't support dictionary with non-homogeneous values, such # as a dict having both a Tensor and a list. return [{'pred_logits': a, 'pred_segments': b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] class SetCriterion(nn.Module): """ This class computes the loss for TadTR. The process happens in two steps: 1) we compute hungarian assignment between ground truth segments and the outputs of the model 2) we supervise each pair of matched ground-truth / prediction (supervise class and segment) """ def __init__(self, num_classes, matcher, weight_dict, losses, focal_alpha=0.25): """ Create the criterion. Parameters: num_classes: number of action categories, omitting the special no-action category matcher: module able to compute a matching between targets and proposals weight_dict: dict containing as key the names of the losses and as values their relative weight. losses: list of all the losses to be applied. See get_loss for list of available losses. focal_alpha: alpha in Focal Loss """ super().__init__() self.num_classes = num_classes self.matcher = matcher self.weight_dict = weight_dict self.losses = losses self.focal_alpha = focal_alpha def loss_labels(self, outputs, targets, indices, num_segments, log=True): """Classification loss (NLL) targets dicts must contain the key "labels" containing a tensor of dim [nb_target_segments] """ assert 'pred_logits' in outputs src_logits = outputs['pred_logits'] idx = self._get_src_permutation_idx(indices) target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) target_classes = torch.full(src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device) target_classes[idx] = target_classes_o target_classes_onehot = torch.zeros([src_logits.shape[0], src_logits.shape[1], src_logits.shape[2] + 1], dtype=src_logits.dtype, layout=src_logits.layout, device=src_logits.device) target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) target_classes_onehot = target_classes_onehot[:,:,:-1] loss_ce = sigmoid_focal_loss(src_logits, target_classes_onehot, num_segments, alpha=self.focal_alpha, gamma=2) * src_logits.shape[1] # nq losses = {'loss_ce': loss_ce} if log: # TODO this should probably be a separate loss, not hacked in this one here losses['class_error'] = 100 - accuracy(src_logits[idx], target_classes_o)[0] return losses def loss_segments(self, outputs, targets, indices, num_segments): """Compute the losses related to the segmentes, the L1 regression loss and the IoU loss targets dicts must contain the key "segments" containing a tensor of dim [nb_target_segments, 2] The target segments are expected in format (center, width), normalized by the video length. """ assert 'pred_segments' in outputs idx = self._get_src_permutation_idx(indices) src_segments = outputs['pred_segments'][idx] target_segments = torch.cat([t['segments'][i] for t, (_, i) in zip(targets, indices)], dim=0) loss_segment = F.l1_loss(src_segments, target_segments, reduction='none') losses = {} losses['loss_segments'] = loss_segment.sum() / num_segments loss_iou = 1 - torch.diag(segment_ops.segment_iou( segment_ops.segment_cw_to_t1t2(src_segments), segment_ops.segment_cw_to_t1t2(target_segments))) losses['loss_iou'] = loss_iou.sum() / num_segments return losses def loss_actionness(self, outputs, targets, indices, num_segments): """Compute the actionness regression loss targets dicts must contain the key "segments" containing a tensor of dim [nb_target_segments, 2] The target segments are expected in format (center, width), normalized by the video length. """ assert 'pred_segments' in outputs assert 'pred_actionness' in outputs src_segments = outputs['pred_segments'].view((-1, 2)) target_segments = torch.cat([t['segments'] for t in targets], dim=0) losses = {} iou_mat = segment_ops.segment_iou( segment_ops.segment_cw_to_t1t2(src_segments), segment_ops.segment_cw_to_t1t2(target_segments)) gt_iou = iou_mat.max(dim=1)[0] pred_actionness = outputs['pred_actionness'] loss_actionness = F.l1_loss(pred_actionness.view(-1), gt_iou.view(-1).detach()) losses['loss_actionness'] = loss_actionness return losses def _get_src_permutation_idx(self, indices): # permute predictions following indices batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)]) src_idx = torch.cat([src for (src, _) in indices]) return batch_idx, src_idx def _get_tgt_permutation_idx(self, indices): # permute targets following indices batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) tgt_idx = torch.cat([tgt for (_, tgt) in indices]) return batch_idx, tgt_idx def get_loss(self, loss, outputs, targets, indices, num_segments, **kwargs): loss_map = { 'labels': self.loss_labels, 'segments': self.loss_segments, 'actionness': self.loss_actionness, } assert loss in loss_map, f'do you really want to compute {loss} loss?' return loss_map[loss](outputs, targets, indices, num_segments, **kwargs) def forward(self, outputs, targets): """ This performs the loss computation. Parameters: outputs: dict of tensors, see the output specification of the model for the format targets: list of dicts, such that len(targets) == batch_size. The expected keys in each dict depends on the losses applied, see each loss' doc """ outputs_without_aux = {k: v for k, v in outputs.items() if k != 'aux_outputs'} # Retrieve the matching between the outputs of the last layer and the targets indices = self.matcher(outputs_without_aux, targets) # Compute the average number of target segments accross all nodes, for normalization purposes num_segments = sum(len(t["labels"]) for t in targets) num_segments = torch.as_tensor([num_segments], dtype=torch.float, device=next(iter(outputs.values())).device) if is_dist_avail_and_initialized(): torch.distributed.all_reduce(num_segments) num_segments = torch.clamp(num_segments / get_world_size(), min=1).item() # Compute all the requested losses losses = {} for loss in self.losses: kwargs = {} losses.update(self.get_loss(loss, outputs, targets, indices, num_segments, **kwargs)) # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. if 'aux_outputs' in outputs: for i, aux_outputs in enumerate(outputs['aux_outputs']): indices = self.matcher(aux_outputs, targets) for loss in self.losses: # we do not compute actionness loss for aux outputs if 'actionness' in loss: continue kwargs = {} if loss == 'labels': # Logging is enabled only for the last layer kwargs['log'] = False l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_segments, **kwargs) l_dict = {k + f'_{i}': v for k, v in l_dict.items()} losses.update(l_dict) self.indices = indices return losses class PostProcess(nn.Module): """ This module converts the model's output into the format expected by the TADEvaluator""" @torch.no_grad() def forward(self, outputs, target_sizes, fuse_score=True): """ Perform the computation Parameters: outputs: raw outputs of the model target_sizes: tensor of dimension [batch_size] containing the duration of each video of the batch """ out_logits, out_segments = outputs['pred_logits'], outputs['pred_segments'] assert len(out_logits) == len(target_sizes) # assert target_sizes.shape[1] == 1 prob = out_logits.sigmoid() # [bs, nq, C] if fuse_score: prob *= outputs['pred_actionness'] segments = segment_ops.segment_cw_to_t1t2(out_segments) # bs, nq, 2 if cfg.postproc_rank == 1: # default # sort across different instances, pick top 100 at most topk_values, topk_indexes = torch.topk(prob.view( out_logits.shape[0], -1), min(cfg.postproc_ins_topk, prob.shape[1]*prob.shape[2]), dim=1) scores = topk_values topk_segments = topk_indexes // out_logits.shape[2] labels = topk_indexes % out_logits.shape[2] # bs, nq, 2; bs, num, 2 segments = torch.gather( segments, 1, topk_segments.unsqueeze(-1).repeat(1, 1, 2)) query_ids = topk_segments else: # pick topk classes for each query # pdb.set_trace() scores, labels = torch.topk(prob, cfg.postproc_cls_topk, dim=-1) scores, labels = scores.flatten(1), labels.flatten(1) # (bs, nq, 1, 2) segments = segments[:, [ i//cfg.postproc_cls_topk for i in range(cfg.postproc_cls_topk*segments.shape[1])], :] query_ids = (torch.arange(0, cfg.postproc_cls_topk*segments.shape[1], 1, dtype=labels.dtype, device=labels.device) // cfg.postproc_cls_topk)[None, :].repeat(labels.shape[0], 1) # from normalized [0, 1] to absolute [0, length] coordinates vid_length = target_sizes scale_fct = torch.stack([vid_length, vid_length], dim=1) segments = segments * scale_fct[:, None, :] results = [{'scores': s, 'labels': l, 'segments': b, 'query_ids': q} for s, l, b, q in zip(scores, labels, segments, query_ids)] return results class MLP(nn.Module): """ Very simple multi-layer perceptron (also called FFN)""" def __init__(self, input_dim, hidden_dim, output_dim, num_layers): super().__init__() self.num_layers = num_layers h = [hidden_dim] * (num_layers - 1) self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) def forward(self, x): for i, layer in enumerate(self.layers): x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) return x def build(args): if args.binary: num_classes = 1 else: if args.dataset_name == 'thumos14': num_classes = 20 elif args.dataset_name == 'muses': num_classes = 25 elif args.dataset_name in ['activitynet', 'hacs']: num_classes = 200 else: raise ValueError('unknown dataset {}'.format(args.dataset_name)) pos_embed = build_position_encoding(args) transformer = build_deformable_transformer(args) model = TadTR( pos_embed, transformer, num_classes=num_classes, num_queries=args.num_queries, aux_loss=args.aux_loss, with_segment_refine=args.seg_refine, with_act_reg=args.act_reg ) matcher = build_matcher(args) losses = ['labels', 'segments'] weight_dict = { 'loss_ce': args.cls_loss_coef, 'loss_segments': args.seg_loss_coef, 'loss_iou': args.iou_loss_coef} if args.act_reg: weight_dict['loss_actionness'] = args.act_loss_coef losses.append('actionness') if args.aux_loss: aux_weight_dict = {} for i in range(args.dec_layers - 1): aux_weight_dict.update({k + f'_{i}': v for k, v in weight_dict.items()}) aux_weight_dict.update({k + f'_enc': v for k, v in weight_dict.items()}) weight_dict.update(aux_weight_dict) criterion = SetCriterion(num_classes, matcher, weight_dict, losses, focal_alpha=args.focal_alpha) postprocessor = PostProcess() return model, criterion, postprocessor ================================================ FILE: models/transformer.py ================================================ # ------------------------------------------------------------------------ # TadTR: End-to-end Temporal Action Detection with Transformer # Copyright (c) 2021. Xiaolong Liu. # ------------------------------------------------------------------------ # Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) # Copyright (c) 2020 SenseTime. All Rights Reserved. # Licensed under the Apache License, Version 2.0 # ------------------------------------------------------------------------ # and DETR (https://github.com/facebookresearch/detr) # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved # ------------------------------------------------------------------------ import copy import torch import torch.nn.functional as F from torch import nn, Tensor from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_ from util.misc import inverse_sigmoid from models.ops.temporal_deform_attn import DeformAttn from opts import cfg class DeformableTransformer(nn.Module): def __init__(self, d_model=256, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=1024, dropout=0.1, activation="relu", return_intermediate_dec=False, num_feature_levels=4, dec_n_points=4, enc_n_points=4): super().__init__() self.d_model = d_model self.nhead = nhead encoder_layer = DeformableTransformerEncoderLayer(d_model, dim_feedforward, dropout, activation, num_feature_levels, nhead, enc_n_points) self.encoder = DeformableTransformerEncoder(encoder_layer, num_encoder_layers) decoder_layer = DeformableTransformerDecoderLayer(d_model, dim_feedforward, dropout, activation, num_feature_levels, nhead, dec_n_points) self.decoder = DeformableTransformerDecoder(decoder_layer, num_decoder_layers, return_intermediate_dec) self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model)) self.reference_points = nn.Linear(d_model, 1) self._reset_parameters() def _reset_parameters(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) for m in self.modules(): if isinstance(m, DeformAttn): m._reset_parameters() xavier_uniform_(self.reference_points.weight.data, gain=1.0) constant_(self.reference_points.bias.data, 0.) normal_(self.level_embed) def get_valid_ratio(self, mask): _, T = mask.shape valid_T = torch.sum(~mask, 1) valid_ratio = valid_T.float() / T return valid_ratio # shape=(bs) def forward(self, srcs, masks, pos_embeds, query_embed=None): ''' Params: srcs: list of Tensor with shape (bs, c, t) masks: list of Tensor with shape (bs, t) pos_embeds: list of Tensor with shape (bs, c, t) query_embed: list of Tensor with shape (nq, 2c) Returns: hs: list, per layer output of decoder init_reference_out: reference points predicted from query embeddings inter_references_out: reference points predicted from each decoder layer memory: (bs, c, t), final output of the encoder ''' assert query_embed is not None # prepare input for encoder src_flatten = [] mask_flatten = [] lvl_pos_embed_flatten = [] temporal_lens = [] for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)): bs, c, t = src.shape temporal_lens.append(t) # (bs, c, t) => (bs, t, c) src = src.transpose(1, 2) pos_embed = pos_embed.transpose(1, 2) lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1) lvl_pos_embed_flatten.append(lvl_pos_embed) src_flatten.append(src) mask_flatten.append(mask) src_flatten = torch.cat(src_flatten, 1) mask_flatten = torch.cat(mask_flatten, 1) lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) temporal_lens = torch.as_tensor(temporal_lens, dtype=torch.long, device=src_flatten.device) level_start_index = torch.cat((temporal_lens.new_zeros((1, )), temporal_lens.cumsum(0)[:-1])) valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) # (bs, nlevels) # deformable encoder memory = self.encoder(src_flatten, temporal_lens, level_start_index, valid_ratios, lvl_pos_embed_flatten if cfg.use_pos_embed else None, mask_flatten) # shape=(bs, t, c) bs, _, c = memory.shape query_embed, tgt = torch.split(query_embed, c, dim=1) query_embed = query_embed.unsqueeze(0).expand(bs, -1, -1) tgt = tgt.unsqueeze(0).expand(bs, -1, -1) reference_points = self.reference_points(query_embed).sigmoid() init_reference_out = reference_points # decoder hs, inter_references = self.decoder(tgt, reference_points, memory, temporal_lens, level_start_index, valid_ratios, query_embed, mask_flatten) inter_references_out = inter_references return hs, init_reference_out, inter_references_out, memory.transpose(1, 2) class DeformableTransformerEncoderLayer(nn.Module): def __init__(self, d_model=256, d_ffn=1024, dropout=0.1, activation="relu", n_levels=4, n_heads=8, n_points=4): super().__init__() # self attention self.self_attn = DeformAttn(d_model, n_levels, n_heads, n_points) self.dropout1 = nn.Dropout(dropout) self.norm1 = nn.LayerNorm(d_model) # ffn self.linear1 = nn.Linear(d_model, d_ffn) self.activation = _get_activation_fn(activation) self.dropout2 = nn.Dropout(dropout) self.linear2 = nn.Linear(d_ffn, d_model) self.dropout3 = nn.Dropout(dropout) self.norm2 = nn.LayerNorm(d_model) @staticmethod def with_pos_embed(tensor, pos): return tensor if pos is None else tensor + pos def forward_ffn(self, src): src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) src = src + self.dropout3(src2) src = self.norm2(src) return src def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, padding_mask=None): # self attention src2, _ = self.self_attn(self.with_pos_embed(src, pos), reference_points, src, spatial_shapes, level_start_index, padding_mask) src = src + self.dropout1(src2) src = self.norm1(src) # ffn src = self.forward_ffn(src) return src class DeformableTransformerEncoder(nn.Module): def __init__(self, encoder_layer, num_layers): super().__init__() self.layers = _get_clones(encoder_layer, num_layers) self.num_layers = num_layers @staticmethod def get_reference_points(spatial_shapes, valid_ratios, device): reference_points_list = [] for lvl, T_ in enumerate(spatial_shapes): ref = torch.linspace(0.5, T_ - 0.5, T_, dtype=torch.float32, device=device) # (t,) ref = ref[None] / (valid_ratios[:, None, lvl] * T_) # (bs, t) reference_points_list.append(ref) reference_points = torch.cat(reference_points_list, 1) reference_points = reference_points[:, :, None] * valid_ratios[:, None] # (N, t, n_levels) return reference_points[..., None] # (N, t, n_levels, 1) def forward(self, src, temporal_lens, level_start_index, valid_ratios, pos=None, padding_mask=None): ''' src: shape=(bs, t, c) temporal_lens: shape=(n_levels). content: [t1, t2, t3, ...] level_start_index: shape=(n_levels,). [0, t1, t1+t2, ...] valid_ratios: shape=(bs, n_levels). ''' output = src # (bs, t, levels, 1) reference_points = self.get_reference_points(temporal_lens, valid_ratios, device=src.device) for _, layer in enumerate(self.layers): output = layer(output, pos, reference_points, temporal_lens, level_start_index, padding_mask) return output class DeformableTransformerDecoderLayer(nn.Module): def __init__(self, d_model=256, d_ffn=1024, dropout=0.1, activation="relu", n_levels=4, n_heads=8, n_points=4): super().__init__() # cross attention self.cross_attn = DeformAttn(d_model, n_levels, n_heads, n_points) self.dropout1 = nn.Dropout(dropout) self.norm1 = nn.LayerNorm(d_model) # self attention self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout) self.dropout2 = nn.Dropout(dropout) self.norm2 = nn.LayerNorm(d_model) # ffn self.linear1 = nn.Linear(d_model, d_ffn) self.activation = _get_activation_fn(activation) self.dropout3 = nn.Dropout(dropout) self.linear2 = nn.Linear(d_ffn, d_model) self.dropout4 = nn.Dropout(dropout) self.norm3 = nn.LayerNorm(d_model) @staticmethod def with_pos_embed(tensor, pos): return tensor if pos is None else tensor + pos def forward_ffn(self, tgt): tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) tgt = tgt + self.dropout4(tgt2) tgt = self.norm3(tgt) return tgt def forward(self, tgt, query_pos, reference_points, src, src_spatial_shapes, level_start_index, src_padding_mask=None): if not cfg.disable_query_self_att: # self attention q = k = self.with_pos_embed(tgt, query_pos) tgt2 = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), tgt.transpose(0, 1))[0].transpose(0, 1) tgt = tgt + self.dropout2(tgt2) tgt = self.norm2(tgt) else: pass # cross attention tgt2, _ = self.cross_attn(self.with_pos_embed(tgt, query_pos), reference_points, src, src_spatial_shapes, level_start_index, src_padding_mask) tgt = tgt + self.dropout1(tgt2) tgt = self.norm1(tgt) # ffn tgt = self.forward_ffn(tgt) return tgt class DeformableTransformerDecoder(nn.Module): def __init__(self, decoder_layer, num_layers, return_intermediate=False): super().__init__() self.layers = _get_clones(decoder_layer, num_layers) self.num_layers = num_layers self.return_intermediate = return_intermediate # hack implementation for iterative bounding box refinement and two-stage Deformable DETR self.segment_embed = None self.class_embed = None def forward(self, tgt, reference_points, src, src_spatial_shapes, src_level_start_index, src_valid_ratios, query_pos=None, src_padding_mask=None): ''' tgt: [bs, nq, C] reference_points: [bs, nq, 1 or 2] src: [bs, T, C] src_valid_ratios: [bs, levels] ''' output = tgt intermediate = [] intermediate_reference_points = [] for lid, layer in enumerate(self.layers): # (bs, nq, 1, 1 or 2) x (bs, 1, num_level, 1) => (bs, nq, num_level, 1 or 2) reference_points_input = reference_points[:, :, None] * src_valid_ratios[:, None,:, None] output = layer(output, query_pos, reference_points_input, src, src_spatial_shapes, src_level_start_index, src_padding_mask) # hack implementation for segment refinement if self.segment_embed is not None: # update the reference point/segment of the next layer according to the output from the current layer tmp = self.segment_embed[lid](output) if reference_points.shape[-1] == 2: new_reference_points = tmp + inverse_sigmoid(reference_points) new_reference_points = new_reference_points.sigmoid() else: # at the 0-th decoder layer # d^(n+1) = delta_d^(n+1) # c^(n+1) = sigmoid( inverse_sigmoid(c^(n)) + delta_c^(n+1)) assert reference_points.shape[-1] == 1 new_reference_points = tmp new_reference_points[..., :1] = tmp[..., :1] + inverse_sigmoid(reference_points) new_reference_points = new_reference_points.sigmoid() reference_points = new_reference_points.detach() if self.return_intermediate: intermediate.append(output) intermediate_reference_points.append(reference_points) if self.return_intermediate: return torch.stack(intermediate), torch.stack(intermediate_reference_points) return output, reference_points def _get_clones(module, N): return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) def _get_activation_fn(activation): """Return an activation function given a string""" if activation == "relu": return F.relu if activation == "gelu": return F.gelu if activation == "glu": return F.glu if activation == "leaky_relu": return F.leaky_relu raise RuntimeError(F"activation should be relu/gelu, not {activation}.") def build_deformable_transformer(args): return DeformableTransformer( d_model=args.hidden_dim, nhead=args.nheads, num_encoder_layers=args.enc_layers, num_decoder_layers=args.dec_layers, dim_feedforward=args.dim_feedforward, dropout=args.dropout, activation=args.activation, return_intermediate_dec=True, num_feature_levels=1, dec_n_points=args.dec_n_points, enc_n_points=args.enc_n_points) ================================================ FILE: opts.py ================================================ # ------------------------------------------------------------------------ # TadTR: End-to-end Temporal Action Detection with Transformer # Copyright (c) 2021 - 2022. Xiaolong Liu. # ------------------------------------------------------------------------ import argparse from easydict import EasyDict import yaml def str2bool(x): if x.lower() in ['true', 't', '1', 'y']: return True else: return False def get_args_parser(): parser = argparse.ArgumentParser('TadTR', add_help=False) parser.add_argument('--cfg', type=str, help='the config file to use') parser.add_argument('--device', default='cuda', help='device to use for training / testing') parser.add_argument('--seed', default=42, type=int) parser.add_argument('--resume', default='', help='resume from checkpoint') parser.add_argument('--eval', action='store_true', help='perform testing') parser.add_argument('--num_workers', default=2, type=int, help='number of dataloader workers') # Multi-GPU training # We support both DataParallel and Distributed DataParallel (DDP) parser.add_argument('--multi_gpu', action='store_true', help='use nn.DataParallel') parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes') parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training') # Other options parser.add_argument('opt', nargs=argparse.REMAINDER, help='Command arguments that override configs') return parser cfg = EasyDict() # ---- Basic option ---- # whether to enable tensorboard cfg.tensorboard = False # Disable CUDA extensions so that we can run the model on CPU cfg.disable_cuda = False # The backend of deformable attention, pytorch or CUDA cfg.dfm_att_backend = 'pytorch' # path where to save, empty for no saving cfg.output_dir = '' # # ------ Data options ------ cfg.dataset_name = 'thumos14' # Use feature input or raw image input (jointly train the video encoder and the detection head). Choices: {feature, image} cfg.input_type = 'feature' # Which kind of feature to use. e.g. i3d, tsn. cfg.feature = 'i3d2s' # dimension (channels) of the video feature cfg.feature_dim = 2048 # Perform binary detection (proposal generation) only cfg.binary = False # Testing on Which subset 'val' or 'test' (For Anet and HACS). Note that we rename the training/validation/testing subsets for all datasets. For example, the validation subset used for training on THUMOS14 is renamed as 'train' subset. cfg.test_set = 'val' # whether to crop video into windows (A window is also called a slice in this codebase). Required for THUMOS14 cfg.online_slice = False # length of video slices. For feature input, the length is for feature sequence. For video input, the length is for frame sequence. cfg.slice_len = None # overlap ratio (=overlap_length/slice_length) between adjacent slices during training cfg.slice_overlap = 0 # overlap ratio between adjacent slices during inference cfg.test_slice_overlap = 0 # ---- Model option -------- # Name of the convolutional backbone to use. If we use video features as input, backbone should be 'none' cfg.backbone = 'none' # whether to use position embedding cfg.use_pos_embed = True # Type of positional embedding to use on top of the video features. Only support sine embedding. cfg.position_embedding = "sine" # Number of encoding layers in the transformer cfg.enc_layers = 2 # Number of decoding layers in the transformer cfg.dec_layers = 4 # Intermediate size of the feedforward layers in the transformer blocks cfg.dim_feedforward = 2048 # Size of the embeddings (dimension of the transformer) cfg.hidden_dim = 256 # Dropout applied in the transformer cfg.dropout = 0.1 # Number of attention heads inside the transformer's attentions cfg.nheads = 8 # Number of sampled points per head for deformable attention in the encoder cfg.enc_n_points = 4 # Number of sampled points per head for deformable attention in the decoder cfg.dec_n_points = 4 # Number of action queries cfg.num_queries = 30 # Transformer activation type, relu|leaky_relu|gelu cfg.activation = 'relu' # Whether to enable segment refinement mechanism cfg.seg_refine = True # Whether to enable actionness regression head cfg.act_reg = True # whether to disable self-attention between action queries cfg.disable_query_self_att = False # ----- Loss and matcher setting ------- # Enable auxiliary decoding losses (loss at each layer) cfg.aux_loss = True # Loss weight cfg.act_loss_coef = 4 cfg.cls_loss_coef = 2 cfg.seg_loss_coef = 5 cfg.iou_loss_coef = 2 # Relative classification weight of the no-action class cfg.eos_coef = 0.1 # For focal loss cfg.focal_alpha = 0.25 # Set cost weight cfg.set_cost_class = 6 # Class coefficient cfg.set_cost_seg = 5 # Segment L1 coefficient cfg.set_cost_iou = 2 # Segment IoU coefficient # ----- Training option ------- # base learning rate. If you set lr in yaml file, don't use this format, use 0.0002 instead cfg.lr = 2e-4 # Valid only when the input is video frames # specify the name pattern of the backbone layers. cfg.lr_backbone_names = ['backbone'] # learning rate of backbone layers cfg.lr_backbone = 1e-5 # special linear projection layers that need to use smaller lr cfg.lr_linear_proj_names = ['reference_points', 'sampling_offsets'] cfg.lr_linear_proj_mult = 0.1 # which optimizer to use, choose from ['AdamW', 'Adam', 'SGD'] cfg.optimizer = 'AdamW' cfg.batch_size = 16 cfg.weight_decay = 1e-4 # gradient clipping max norm cfg.clip_max_norm = 0.1 # maximum number of training epochs cfg.epochs = 16 # when to decay lr cfg.lr_step = [14] # save checkpoint every N epochs. Set it to a small value if you want to save intermediate models cfg.ckpt_interval = 10 # update parameters every N forward-backward passes. N=1 (default) cfg.iter_size = 1 # test model every N epochs. N=1 (default) cfg.test_interval = 1 # ----- Postproc option ------- # How to rank the predicted instances. # 1: for each query, generate a instance for each class; then pick top-scored instance from the whole set # 2: pick top classes for each query cfg.postproc_rank = 1 # for each query, pick top k classes; keep all queries # this setting is useful for debug cfg.postproc_cls_topk = 1 # for each video, pick topk detections cfg.postproc_ins_topk = 100 # IoU threshold for NMS. Note that NMS is not necessary. cfg.nms_thr = 0.4 def update_cfg_with_args(cfg, arg_list): from ast import literal_eval for i in range(0, len(arg_list), 2): cur_entry = cfg key_parts = arg_list[i].split('.') for k in key_parts[:-1]: cur_entry = cur_entry[k] node = key_parts[-1] try: cur_entry[node] = literal_eval(arg_list[i+1]) except: # print(f'literal_eval({arg_list[i+1]}) failed, directly take the value') cur_entry[node] = arg_list[i+1] def update_cfg_from_file(cfg, cfg_path): import os assert os.path.exists(cfg_path), 'cfg_path is invalid' cfg_from_file = yaml.load(open(cfg_path), yaml.FullLoader) cfg.update(cfg_from_file) ================================================ FILE: requirements.txt ================================================ torch>=1.5.1 torchvision>=0.6.1 scipy tqdm easydict PyYAML numpy pandas ================================================ FILE: scripts/run_parallel.sh ================================================ # Run on two GPUs in non-distributed mode (more convenient) CUDA_VISIBLE_DEVICES=0,1 python -u main.py --cfg "CFG_PATH" --multi_gpu # Run on two GPUs in distributed mode (more powerful) MASTER_PORT=29510 CUDA_VISIBLE_DEVICES=0,1 python -u -m torch.distributed.launch --nproc_per_node=2 --master_port ${MASTER_PORT} --use_env main.py --cfg "CFG_PATH" ================================================ FILE: scripts/test_reference_models.sh ================================================ dataset=$1 if [[ $dataset = thumos14 ]];then CUDA_VISIBLE_DEVICES=0 python main.py --cfg configs/thumos14_i3d2s_tadtr.yml --eval --resume data/thumos14/thumos14_i3d2s_tadtr_reference.pth else echo "Unsupported dataset ${dataset}. Exit" fi ================================================ FILE: util/__init__.py ================================================ # ------------------------------------------------------------------------ # TadTR: End-to-end Temporal Action Detection with Transformer # Copyright (c) 2021. Xiaolong Liu. # ------------------------------------------------------------------------ ================================================ FILE: util/logger.py ================================================ # ------------------------------------------------------------------------ # TadTR: End-to-end Temporal Action Detection with Transformer # Copyright (c) 2021. Xiaolong Liu. # ------------------------------------------------------------------------ import builtins import logging import sys from .misc import is_main_process def _suppress_print(): """ Suppresses printing from the current process. """ def print_pass(*objects, sep=" ", end="\n", file=sys.stdout, flush=False): pass builtins.print = print_pass def setup_logger(log_file_path, name=None, level=logging.INFO): """ Setup a logger that simultaneously output to a file and stdout ARGS log_file_path: string, path to the logging file """ if is_main_process(): print('this is master process, set up logger') # logging settings # log_formatter = logging.Formatter("%(asctime)s [%(levelname)-5.5s] %(message)s") log_formatter = logging.Formatter( "[%(asctime)s][%(levelname)s] %(pathname)s: %(lineno)4d: %(message)s", datefmt="%m/%d %H:%M:%S") root_logger = logging.getLogger(name) if name: root_logger.propagate = False root_logger.setLevel(level) # file handler if log_file_path is not None: log_file_handler = logging.FileHandler(log_file_path) log_file_handler.setFormatter(log_formatter) root_logger.addHandler(log_file_handler) # stdout handler log_formatter = logging.Formatter( "[%(asctime)s][%(levelname)s]: %(message)s", datefmt="%m/%d %H:%M:%S") log_stream_handler = logging.StreamHandler(sys.stdout) log_stream_handler.setFormatter(log_formatter) root_logger.addHandler(log_stream_handler) logging.info('Log file is %s' % log_file_path) return root_logger else: print('this is not a master process, suppress print') _suppress_print() ================================================ FILE: util/misc.py ================================================ # ------------------------------------------------------------------------ # TadTR: End-to-end Temporal Action Detection with Transformer # Copyright (c) 2021. Xiaolong Liu. # ------------------------------------------------------------------------ # Modified from DETR (https://github.com/facebookresearch/detr) # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved # ------------------------------------------------------------------------ """ Misc functions, including distributed helpers. Mostly copy-paste from torchvision references. """ import os import subprocess import time from collections import defaultdict, deque import datetime import pickle from typing import Optional, List import torch import torch.distributed as dist from torch import Tensor import logging # needed due to empty tensor bug in pytorch and torchvision 0.5 import torchvision def mkdir_if_not_exist(dirname): if not os.path.exists(dirname): os.makedirs(dirname) class SmoothedValue(object): """Track a series of values and provide access to smoothed values over a window or the global series average. """ def __init__(self, window_size=20, fmt=None): if fmt is None: fmt = "{median:.4f} ({global_avg:.4f})" self.deque = deque(maxlen=window_size) self.total = 0.0 self.count = 0 self.fmt = fmt def update(self, value, n=1): self.deque.append(value) self.count += n self.total += value * n def synchronize_between_processes(self): """ Warning: does not synchronize the deque! """ if not is_dist_avail_and_initialized(): return t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda') dist.barrier() dist.all_reduce(t) t = t.tolist() self.count = int(t[0]) self.total = t[1] @property def median(self): d = torch.tensor(list(self.deque)) return d.median().item() @property def avg(self): d = torch.tensor(list(self.deque), dtype=torch.float32) return d.mean().item() @property def global_avg(self): return self.total / self.count @property def max(self): return max(self.deque) @property def value(self): return self.deque[-1] def __str__(self): return self.fmt.format( median=self.median, avg=self.avg, global_avg=self.global_avg, max=self.max, value=self.value) def all_gather(data): """ Run all_gather on arbitrary picklable data (not necessarily tensors) Args: data: any picklable object Returns: list[data]: list of data gathered from each rank """ world_size = get_world_size() if world_size == 1: return [data] # serialized to a Tensor buffer = pickle.dumps(data) storage = torch.ByteStorage.from_buffer(buffer) tensor = torch.ByteTensor(storage).to("cuda") # obtain Tensor size of each rank local_size = torch.tensor([tensor.numel()], device="cuda") size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)] dist.all_gather(size_list, local_size) size_list = [int(size.item()) for size in size_list] max_size = max(size_list) # receiving Tensor from all ranks # we pad the tensor because torch all_gather does not support # gathering tensors of different shapes tensor_list = [] for _ in size_list: tensor_list.append(torch.empty( (max_size,), dtype=torch.uint8, device="cuda")) if local_size != max_size: padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda") tensor = torch.cat((tensor, padding), dim=0) dist.all_gather(tensor_list, tensor) data_list = [] for size, tensor in zip(size_list, tensor_list): buffer = tensor.cpu().numpy().tobytes()[:size] data_list.append(pickle.loads(buffer)) return data_list def reduce_dict(input_dict, average=True): """ Args: input_dict (dict): all the values will be reduced average (bool): whether to do average or sum Reduce the values in the dictionary from all processes so that all processes have the averaged results. Returns a dict with the same fields as input_dict, after reduction. """ world_size = get_world_size() if world_size < 2: return input_dict with torch.no_grad(): names = [] values = [] # sort the keys so that they are consistent across processes for k in sorted(input_dict.keys()): names.append(k) values.append(input_dict[k]) values = torch.stack(values, dim=0) dist.all_reduce(values) if average: values /= world_size reduced_dict = {k: v for k, v in zip(names, values)} return reduced_dict class MetricLogger(object): def __init__(self, delimiter="\t"): self.meters = defaultdict(SmoothedValue) self.delimiter = delimiter def update(self, **kwargs): for k, v in kwargs.items(): if isinstance(v, torch.Tensor): v = v.item() assert isinstance(v, (float, int)) self.meters[k].update(v) def __getattr__(self, attr): if attr in self.meters: return self.meters[attr] if attr in self.__dict__: return self.__dict__[attr] raise AttributeError("'{}' object has no attribute '{}'".format( type(self).__name__, attr)) def __str__(self): loss_str = [] for name, meter in self.meters.items(): loss_str.append( "{}: {}".format(name, str(meter)) ) return self.delimiter.join(loss_str) def synchronize_between_processes(self): for meter in self.meters.values(): meter.synchronize_between_processes() def add_meter(self, name, meter): self.meters[name] = meter def log_every(self, iterable, print_freq, header=None): i = 0 if not header: header = '' start_time = time.time() end = time.time() iter_time = SmoothedValue(fmt='{avg:.4f}') data_time = SmoothedValue(fmt='{avg:.4f}') space_fmt = ':' + str(len(str(len(iterable)))) + 'd' if torch.cuda.is_available(): log_msg = self.delimiter.join([ header, '[{0' + space_fmt + '}/{1}]', 'eta: {eta}', '{meters}', 'time: {time}', 'data: {data}', 'max mem: {memory:.0f}' ]) else: log_msg = self.delimiter.join([ header, '[{0' + space_fmt + '}/{1}]', 'eta: {eta}', '{meters}', 'time: {time}', 'data: {data}' ]) MB = 1024.0 * 1024.0 for obj in iterable: data_time.update(time.time() - end) yield obj iter_time.update(time.time() - end) if i % print_freq == 0 or i == len(iterable) - 1: eta_seconds = iter_time.global_avg * (len(iterable) - i) eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) if torch.cuda.is_available(): logging.info(log_msg.format( i, len(iterable), eta=eta_string, meters=str(self), time=str(iter_time), data=str(data_time), memory=torch.cuda.max_memory_allocated() / MB)) else: logging.info(log_msg.format( i, len(iterable), eta=eta_string, meters=str(self), time=str(iter_time), data=str(data_time))) i += 1 end = time.time() total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) logging.info('{} Total time: {} ({:.4f} s / it)'.format( header, total_time_str, total_time / len(iterable))) def get_sha(): cwd = os.path.dirname(os.path.abspath(__file__)) def _run(command): return subprocess.check_output(command, cwd=cwd).decode('ascii').strip() sha = 'N/A' diff = "clean" branch = 'N/A' try: sha = _run(['git', 'rev-parse', 'HEAD']) subprocess.check_output(['git', 'diff'], cwd=cwd) diff = _run(['git', 'diff-index', 'HEAD']) diff = "has uncommited changes" if diff else "clean" branch = _run(['git', 'rev-parse', '--abbrev-ref', 'HEAD']) except Exception: pass message = f"sha: {sha}, status: {diff}, branch: {branch}" return message def collate_fn(batch): batch = list(zip(*batch)) batch[0] = nested_tensor_from_tensor_list(batch[0]) # print('collate_fn done') return tuple(batch) def _max_by_axis(the_list): # type: (List[List[int]]) -> List[int] maxes = the_list[0] for sublist in the_list[1:]: for index, item in enumerate(sublist): maxes[index] = max(maxes[index], item) return maxes class NestedTensor(object): def __init__(self, tensors, mask: Optional[Tensor]): self.tensors = tensors self.mask = mask def to(self, device): # type: (Device) -> NestedTensor # noqa cast_tensor = self.tensors.to(device) mask = self.mask if mask is not None: assert mask is not None cast_mask = mask.to(device) else: cast_mask = None return NestedTensor(cast_tensor, cast_mask) # def cuda(self): # tensors = self.tensors.cuda() # mask = self.mask.cuda() # return NestedTensor(tensors, mask) def decompose(self): return self.tensors, self.mask def __repr__(self): return str(self.tensors) def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): # TODO make this more general if tensor_list[0].ndim == 3: # n,c,t if torchvision._is_tracing(): # nested_tensor_from_tensor_list() does not export well to ONNX # call _onnx_nested_tensor_from_tensor_list() instead return _onnx_nested_tensor_from_tensor_list(tensor_list) # TODO make it support different-sized images max_size = _max_by_axis([list(img.shape) for img in tensor_list]) # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) batch_shape = [len(tensor_list)] + max_size b, c, h, w = batch_shape dtype = tensor_list[0].dtype device = tensor_list[0].device tensor = torch.zeros(batch_shape, dtype=dtype, device=device) mask = torch.ones((b, h, w), dtype=torch.bool, device=device) for img, pad_img, m in zip(tensor_list, tensor, mask): pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) m[: img.shape[1], :img.shape[2]] = False elif tensor_list[0].ndim == 2 or tensor_list[0].ndim == 4: max_size = max([video_ft.shape[1] for video_ft in tensor_list]) # [c,t,h,w] or [c,t] # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) if tensor_list[0].ndim == 2: batch_shape = [len(tensor_list), tensor_list[0].shape[0], max_size] else: batch_shape = [len(tensor_list), tensor_list[0].shape[0], max_size, tensor_list[0].shape[2], tensor_list[0].shape[3]] b, c, t = batch_shape[:3] dtype = tensor_list[0].dtype device = tensor_list[0].device tensor = torch.zeros(batch_shape, dtype=dtype, device=device) mask = torch.ones((b, t), dtype=torch.bool, device=device) for video_ft, pad_video_ft, m in zip(tensor_list, tensor, mask): pad_video_ft[: video_ft.shape[0], : video_ft.shape[1]].copy_(video_ft) m[: video_ft.shape[1]] = False else: raise ValueError('not supported') return NestedTensor(tensor, mask) def make_nested_tensor(tensor): b, t = tensor.shape[0], tensor.shape[2] mask = torch.zeros([b, t], dtype=torch.bool, device=tensor.device) return NestedTensor(tensor, mask) # _onnx_nested_tensor_from_tensor_list() is an implementation of # nested_tensor_from_tensor_list() that is supported by ONNX tracing. @torch.jit.unused def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: max_size = [] for i in range(tensor_list[0].dim()): max_size_i = torch.max(torch.stack( [img.shape[i] for img in tensor_list]).to(torch.float32)).to(torch.int64) max_size.append(max_size_i) max_size = tuple(max_size) # work around for # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) # m[: img.shape[1], :img.shape[2]] = False # which is not yet supported in onnx padded_imgs = [] padded_masks = [] for img in tensor_list: padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] padded_img = torch.nn.functional.pad( img, (0, padding[2], 0, padding[1], 0, padding[0])) padded_imgs.append(padded_img) m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) padded_mask = torch.nn.functional.pad( m, (0, padding[2], 0, padding[1]), "constant", 1) padded_masks.append(padded_mask.to(torch.bool)) tensor = torch.stack(padded_imgs) mask = torch.stack(padded_masks) return NestedTensor(tensor, mask=mask) def setup_for_distributed(is_master): """ This function disables printing when not in master process """ import builtins as __builtin__ builtin_print = __builtin__.print def print(*args, **kwargs): force = kwargs.pop('force', False) if is_master or force: builtin_print(*args, **kwargs) __builtin__.print = print def is_dist_avail_and_initialized(): if not dist.is_available(): return False if not dist.is_initialized(): return False return True def get_world_size(): if not is_dist_avail_and_initialized(): return 1 return dist.get_world_size() def get_rank(): if not is_dist_avail_and_initialized(): return 0 return dist.get_rank() def is_main_process(): return get_rank() == 0 def save_on_master(*args, **kwargs): if is_main_process(): torch.save(*args, **kwargs) def init_distributed_mode(args): if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: args.rank = int(os.environ["RANK"]) args.world_size = int(os.environ['WORLD_SIZE']) args.gpu = int(os.environ['LOCAL_RANK']) elif 'SLURM_PROCID' in os.environ: args.rank = int(os.environ['SLURM_PROCID']) args.gpu = args.rank % torch.cuda.device_count() else: print('Not using distributed mode') args.distributed = False return args.distributed = True torch.cuda.set_device(args.gpu) args.dist_backend = 'nccl' print('| distributed init (rank {}): {}'.format( args.rank, args.dist_url), flush=True) torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank) torch.distributed.barrier() setup_for_distributed(args.rank == 0) @torch.no_grad() def accuracy(output, target, topk=(1,)): """Computes the precision@k for the specified values of k""" if target.numel() == 0: return [torch.zeros([], device=output.device)] maxk = max(topk) batch_size = target.size(0) _, pred = output.topk(maxk, 1, True, True) pred = pred.t() correct = pred.eq(target.view(1, -1).expand_as(pred)) res = [] for k in topk: correct_k = correct[:k].view(-1).float().sum(0) res.append(correct_k.mul_(100.0 / batch_size)) return res def inverse_sigmoid(x, eps=1e-5): x = x.clamp(min=0, max=1) x1 = x.clamp(min=eps) x2 = (1 - x).clamp(min=eps) return torch.log(x1/x2) ================================================ FILE: util/segment_ops.py ================================================ # ------------------------------------------------------------------------ # TadTR: End-to-end Temporal Action Detection with Transformer # Copyright (c) 2021. Xiaolong Liu. # ------------------------------------------------------------------------ # Modified from DETR (https://github.com/facebookresearch/detr) # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved # ------------------------------------------------------------------------ """ Utilities for segment manipulation and IoU. """ import torch import numpy as np # from torchvision.ops.boxes import box_area def segment_cw_to_t1t2(x): '''corresponds to box_cxcywh_to_xyxy in detr Params: x: segments in (center, width) format, shape=(*, 2) Returns: segments in (t_start, t_end) format, shape=(*, 2) ''' if not isinstance(x, np.ndarray): x_c, w = x.unbind(-1) b = [(x_c - 0.5 * w), (x_c + 0.5 * w)] return torch.stack(b, dim=-1) else: x_c, w = x[..., 0], x[..., 1] b = [(x_c - 0.5 * w)[..., None], (x_c + 0.5 * w)[..., None]] return np.concatenate(b, axis=-1) def segment_t1t2_to_cw(x): '''corresponds to box_xyxy_to_cxcywh in detr Params: x: segments in (t_start, t_end) format, shape=(*, 2) Returns: segments in (center, width) format, shape=(*, 2) ''' if not isinstance(x, np.ndarray): x1, x2 = x.unbind(-1) b = [(x1 + x2) / 2, (x2 - x1)] return torch.stack(b, dim=-1) else: x1, x2 = x[..., 0], x[..., 1] b = [((x1 + x2) / 2)[..., None], (x2 - x1)[..., None]] return np.concatenate(b, axis=-1) def segment_length(segments): return (segments[:, 1]-segments[:, 0]).clamp(min=0) # modified from torchvision to also return the union def segment_iou_and_union(segments1, segments2): area1 = segment_length(segments1) area2 = segment_length(segments2) l = torch.max(segments1[:, None, 0], segments2[:, 0]) # N,M r = torch.min(segments1[:, None, 1], segments2[:, 1]) # N,M inter = (r - l).clamp(min=0) # [N,M] union = area1[:, None] + area2 - inter iou = inter / union return iou, union def segment_iou(segments1, segments2): """ Temporal IoU between The boxes should be in [x0, y0, x1, y1] format Returns a [N, M] pairwise matrix, where N = len(segments1) and M = len(segments2) """ # degenerate boxes gives inf / nan results # so do an early check assert (segments1[:, 1] >= segments1[:, 0]).all() area1 = segment_length(segments1) area2 = segment_length(segments2) l = torch.max(segments1[:, None, 0], segments2[:, 0]) # N,M r = torch.min(segments1[:, None, 1], segments2[:, 1]) # N,M inter = (r - l).clamp(min=0) # [N,M] union = area1[:, None] + area2 - inter iou = inter / union return iou def temporal_iou_numpy(proposal_min, proposal_max, gt_min, gt_max): """Compute IoU score between a groundtruth instance and the proposals. Args: proposal_min (list[float]): List of temporal anchor min. proposal_max (list[float]): List of temporal anchor max. gt_min (float): Groundtruth temporal box min. gt_max (float): Groundtruth temporal box max. Returns: list[float]: List of iou scores. """ len_anchors = proposal_max - proposal_min int_tmin = np.maximum(proposal_min, gt_min) int_tmax = np.minimum(proposal_max, gt_max) inter_len = np.maximum(int_tmax - int_tmin, 0.) union_len = len_anchors - inter_len + gt_max - gt_min jaccard = np.divide(inter_len, union_len) return jaccard def temporal_iou_numpy(proposal_min, proposal_max, gt_min, gt_max): """Compute IoP score between a groundtruth bbox and the proposals. Compute the IoP which is defined as the overlap ratio with groundtruth proportional to the duration of this proposal. Args: proposal_min (list[float]): List of temporal anchor min. proposal_max (list[float]): List of temporal anchor max. gt_min (float): Groundtruth temporal box min. gt_max (float): Groundtruth temporal box max. Returns: list[float]: List of intersection over anchor scores. """ len_anchors = np.array(proposal_max - proposal_min) int_tmin = np.maximum(proposal_min, gt_min) int_tmax = np.minimum(proposal_max, gt_max) inter_len = np.maximum(int_tmax - int_tmin, 0.) scores = np.divide(inter_len, len_anchors) return scores def soft_nms(proposals, alpha, low_threshold, high_threshold, top_k): """Soft NMS for temporal proposals. Args: proposals (np.ndarray): Proposals generated by network. alpha (float): Alpha value of Gaussian decaying function. low_threshold (float): Low threshold for soft nms. high_threshold (float): High threshold for soft nms. top_k (int): Top k values to be considered. Returns: np.ndarray: The updated proposals. """ proposals = proposals[proposals[:, -1].argsort()[::-1]] tstart = list(proposals[:, 0]) tend = list(proposals[:, 1]) tscore = list(proposals[:, 2]) rstart = [] rend = [] rscore = [] while len(tscore) > 0 and len(rscore) <= top_k: max_index = np.argmax(tscore) max_width = tend[max_index] - tstart[max_index] iou_list = temporal_iou_numpy(tstart[max_index], tend[max_index], np.array(tstart), np.array(tend)) iou_exp_list = np.exp(-np.square(iou_list) / alpha) for idx, _ in enumerate(tscore): if idx != max_index: current_iou = iou_list[idx] if current_iou > low_threshold + (high_threshold - low_threshold) * max_width: tscore[idx] = tscore[idx] * iou_exp_list[idx] rstart.append(tstart[max_index]) rend.append(tend[max_index]) rscore.append(tscore[max_index]) tstart.pop(max_index) tend.pop(max_index) tscore.pop(max_index) rstart = np.array(rstart).reshape(-1, 1) rend = np.array(rend).reshape(-1, 1) rscore = np.array(rscore).reshape(-1, 1) new_proposals = np.concatenate((rstart, rend, rscore), axis=1) return new_proposals def temporal_nms(segments, thresh): """ One-dimensional non-maximal suppression :param segments: [[st, ed, score, ...], ...] :param thresh: :return: """ t1 = segments[:, 0] t2 = segments[:, 1] scores = segments[:, 2] durations = t2 - t1 order = scores.argsort()[::-1] keep = [] while order.size > 0: i = order[0] keep.append(i) tt1 = np.maximum(t1[i], t1[order[1:]]) tt2 = np.minimum(t2[i], t2[order[1:]]) intersection = tt2 - tt1 IoU = intersection / \ (durations[i] + durations[order[1:]] - intersection).astype(float) inds = np.where(IoU <= thresh)[0] order = order[inds + 1] return segments[keep, :]