[
  {
    "path": ".gitignore",
    "content": "# User defined\ndata/\noutputs/\n\n\n# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\nshare/python-wheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.nox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n*.py,cover\n.hypothesis/\n.pytest_cache/\ncover/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\ndb.sqlite3-journal\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\n.pybuilder/\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# IPython\nprofile_default/\nipython_config.py\n\n# pyenv\n#   For a library or package, you might want to ignore these files since the code is\n#   intended to run in multiple environments; otherwise, check them in:\n# .python-version\n\n# pipenv\n#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.\n#   However, in case of collaboration, if having platform-specific dependencies or dependencies\n#   having no cross-platform support, pipenv may install dependencies that don't work, or not\n#   install all needed dependencies.\n#Pipfile.lock\n\n# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow\n__pypackages__/\n\n# Celery stuff\ncelerybeat-schedule\ncelerybeat.pid\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n.dmypy.json\ndmypy.json\n\n# Pyre type checker\n.pyre/\n\n# pytype static type analyzer\n.pytype/\n\n# Cython debug symbols\ncython_debug/\n"
  },
  {
    "path": "Evaluation/README.md",
    "content": "#ActivityNet Large Scale Activity Recognition Challenge - Evaluation Toolkit\r\nThis is the documentation of the ActivityNet Large Scale Activity Recognition\r\nChallenge Evaluation Toolkit. It includes APIs to evaluate the performance of a method in the two different tasks in the challenge: *untrimmed video classification* and *activity detection*. For more information about the challenge competitions, please read the [guidelines](http://activity-net.org/challenges/2016/guidelines.html).\r\n\r\n##Dependencies\r\nThe Evaluation Toolkit is purely written in Python (>=2.7) and it requires the \r\nfollowing third party libraries:\r\n* [Numpy](http://www.numpy.org/)\r\n* [Pandas](http://pandas.pydata.org/)\r\n\r\n##Getting started\r\nWe include sample prediction files in the folder data to show how to evaluate your prediction results. Please follow this steps to obtain the performance evaluation on the provided sample files:\r\n* Run `git clone` this repository.\r\n* To evaluate classification performance call: `python get_classification_performance.py data/activity_net.v1-3.min.json sample_classification_prediction.json`\r\n* To evaluate detection performance call: `python get_detection_performance.py data/activity_net.v1-3.min.json sample_detection_prediction.json`\r\n\r\n##Contributions and Troubleshooting\r\nWe are welcome to contributions, please keep your pull-request simple so we can go back to you as soon as we can. If you found a bug please open a new issue and describe the problem.\r\n"
  },
  {
    "path": "Evaluation/eval_detection.py",
    "content": "import json\r\nimport sys\r\n\r\nimport urllib.error, urllib.parse\r\n\r\nimport numpy as np\r\nimport pandas as pd\r\n\r\nfrom .utils import get_blocked_videos\r\nfrom .utils import interpolated_prec_rec\r\nfrom .utils import segment_iou\r\nimport pdb\r\nimport traceback\r\nimport logging\r\n\r\n\r\nfrom joblib import Parallel, delayed\r\n\r\n\r\nlogger_initilized = False\r\n\r\n\r\ndef setup_logger(log_file_path, name=None, level=logging.INFO):\r\n    \"\"\"\r\n    Setup a logger that simultaneously output to a file and stdout\r\n    ARGS\r\n        log_file_path: string, path to the logging file\r\n    \"\"\"\r\n    # logging settings\r\n    #   log_formatter = logging.Formatter(\"%(asctime)s [%(levelname)-5.5s]  %(message)s\")\r\n    log_formatter = logging.Formatter(\r\n            \"[%(asctime)s][%(levelname)s] %(pathname)s: %(lineno)4d: %(message)s\",\r\n            datefmt=\"%m/%d %H:%M:%S\")\r\n    root_logger = logging.getLogger(name)\r\n\r\n    if name:\r\n        root_logger.propagate = False\r\n    root_logger.setLevel(level)\r\n    # file handler\r\n    if log_file_path is not None:\r\n        log_file_handler = logging.FileHandler(log_file_path)\r\n        log_file_handler.setFormatter(log_formatter)\r\n        root_logger.addHandler(log_file_handler)\r\n \r\n    log_formatter = logging.Formatter(\r\n            \"[%(asctime)s][%(levelname)s]: %(message)s\",\r\n            datefmt=\"%m/%d %H:%M:%S\")\r\n    log_stream_handler = logging.StreamHandler(sys.stdout)\r\n    log_stream_handler.setFormatter(log_formatter)\r\n    # log_stream_handler.setLevel(logging.INFO)\r\n    root_logger.addHandler(log_stream_handler)\r\n\r\n    logging.info('Log file is %s' % log_file_path)\r\n    global logger_initilized\r\n    logger_initilized = True\r\n    return root_logger\r\n\r\n\r\ndef get_classes(anno_dict):\r\n    if 'classes' in anno_dict:\r\n        classes = anno_dict['classes']\r\n    else:\r\n        \r\n        database = 
anno_dict['database']\r\n        all_gts = []\r\n        for vid in database:\r\n            all_gts += database[vid]['annotations']\r\n        classes = list(sorted({x['label'] for x in all_gts}))\r\n    return classes\r\n\r\n\r\nclass ANETdetection(object):\r\n\r\n    GROUND_TRUTH_FIELDS = ['database', 'taxonomy', 'version']\r\n    PREDICTION_FIELDS = ['results', 'version', 'external_data']\r\n\r\n    def __init__(self, ground_truth_filename=None, prediction_filename=None,\r\n                 ground_truth_fields=GROUND_TRUTH_FIELDS,\r\n                 prediction_fields=PREDICTION_FIELDS,\r\n                 tiou_thresholds=np.linspace(0.5, 0.95, 10), \r\n                 subset='validation', verbose=False, \r\n                 check_status=False, log_path=None, exclude_videos=None):\r\n        \r\n        if not ground_truth_filename:\r\n            raise IOError('Please input a valid ground truth file.')\r\n        if not prediction_filename:\r\n            raise IOError('Please input a valid prediction file.')\r\n        self.subset = subset\r\n        # if log_path is None:\r\n        if not logger_initilized:\r\n            print('setup logger')\r\n            logger = setup_logger(log_path)\r\n        else:\r\n            logger = logging.getLogger()\r\n        self.logger = logger\r\n        \r\n        self.tiou_thresholds = tiou_thresholds\r\n        self.verbose = verbose\r\n        self.gt_fields = ground_truth_fields\r\n        self.pred_fields = prediction_fields\r\n        self.ap = None\r\n        self.check_status = check_status\r\n        \r\n        self.blocked_videos = exclude_videos if exclude_videos else list()\r\n        # self.blocked_videos = ['video_test_0000270', 'video_test_0001292', 'video_test_0001496']\r\n        # Import ground truth and predictions.\r\n        self.ground_truth, self.activity_index = self._import_ground_truth(\r\n            ground_truth_filename)\r\n        self.prediction = 
self._import_prediction(prediction_filename)\r\n\r\n        if self.verbose:\r\n            self.logger.info('[INIT] Loaded annotations from {} subset.'.format(subset))\r\n            nr_gt = len(self.ground_truth)\r\n            self.logger.info('\\tNumber of ground truth instances: {}'.format(nr_gt))\r\n            nr_pred = len(self.prediction)\r\n            self.logger.info('\\tNumber of predictions: {}'.format(nr_pred))\r\n            self.logger.info('\\tFixed threshold for tiou score: {}'.format(self.tiou_thresholds))\r\n\r\n    def _import_ground_truth(self, ground_truth_filename):\r\n        \"\"\"Reads ground truth file, checks if it is well formatted, and returns\r\n           the ground truth instances and the activity classes.\r\n\r\n        Parameters\r\n        ----------\r\n        ground_truth_filename : str\r\n            Full path to the ground truth json file.\r\n\r\n        Outputs\r\n        -------\r\n        ground_truth : df\r\n            Data frame containing the ground truth instances.\r\n        activity_index : dict\r\n            Dictionary containing class index.\r\n        \"\"\"\r\n        if isinstance(ground_truth_filename, str):\r\n            with open(ground_truth_filename, 'r') as fobj:\r\n                data = json.load(fobj)\r\n        else:\r\n            data = ground_truth_filename\r\n        # # Checking format\r\n        # if not all([field in list(data.keys()) for field in self.gt_fields]):\r\n        #     raise IOError('Please input a valid ground truth file.')\r\n\r\n        # Read ground truth data.\r\n        # activity_index, cidx = {}, 0\r\n\r\n        class_list = get_classes(data)\r\n        activity_index = {cls_name: idx for idx, cls_name in enumerate(class_list)}\r\n        video_lst, t_start_lst, t_end_lst, label_lst, difficult_lst = [], [], [], [], []\r\n        for videoid, v in data['database'].items():\r\n            if self.subset != v['subset']:\r\n                continue\r\n            if 
videoid in self.blocked_videos:\r\n                continue\r\n            for ann in v['annotations']:\r\n                # if ann['label'] not in class_list:\r\n                #     class_list.append(ann['label'])\r\n                video_lst.append(videoid)\r\n                t_start_lst.append(float(ann['segment'][0]))\r\n                t_end_lst.append(float(ann['segment'][1]))\r\n                label_lst.append(activity_index[ann['label']])\r\n                difficult = 0 if 'difficult' not in ann else ann['difficult']\r\n                difficult_lst.append(difficult)\r\n\r\n        ground_truth = pd.DataFrame({'video-id': video_lst,\r\n                                     't-start': t_start_lst,\r\n                                     't-end': t_end_lst,\r\n                                     'label': label_lst,\r\n                                     'difficult': difficult_lst})\r\n        self.class_list = [x for x in class_list]\r\n        \r\n        return ground_truth, activity_index\r\n\r\n    def _import_prediction(self, prediction_filename):\r\n        \"\"\"Reads prediction file, checks if it is well formatted, and returns\r\n           the prediction instances.\r\n\r\n        Parameters\r\n        ----------\r\n        prediction_filename : str\r\n            Full path to the prediction json file.\r\n\r\n        Outputs\r\n        -------\r\n        prediction : df\r\n            Data frame containing the prediction instances.\r\n        \"\"\"\r\n        if isinstance(prediction_filename, str):\r\n            with open(prediction_filename, 'r') as fobj:\r\n                data = json.load(fobj)\r\n        else:\r\n            data = prediction_filename\r\n        # Checking format...\r\n        if not all([field in list(data.keys()) for field in self.pred_fields]):\r\n            raise IOError('Please input a valid prediction file.')\r\n\r\n        # Read predicitons.\r\n        video_lst, t_start_lst, t_end_lst = [], [], []\r\n        
label_lst, score_lst = [], []\r\n        for videoid, v in data['results'].items():\r\n            if videoid in self.blocked_videos:\r\n                continue\r\n            for result in v:\r\n                label = self.activity_index[result['label']]\r\n                video_lst.append(videoid)\r\n                t_start_lst.append(result['segment'][0])\r\n                t_end_lst.append(result['segment'][1])\r\n                label_lst.append(label)\r\n                score_lst.append(result['score'])\r\n        prediction = pd.DataFrame({'video-id': video_lst,\r\n                                   't-start': t_start_lst,\r\n                                   't-end': t_end_lst,\r\n                                   'label': label_lst,\r\n                                   'score': score_lst})\r\n        return prediction\r\n\r\n    # def wrapper_compute_average_precision(self):\r\n    #     \"\"\"Computes average precision for each class in the subset.\r\n    #     \"\"\"\r\n    #     ap = np.zeros((len(self.tiou_thresholds), len(list(self.activity_index.items()))))\r\n    #     for activity, cidx in self.activity_index.items():\r\n    #         gt_idx = self.ground_truth['label'] == cidx\r\n    #         pred_idx = self.prediction['label'] == cidx\r\n    #         ap[:,cidx] = compute_average_precision_detection(\r\n    #             self.ground_truth.loc[gt_idx].reset_index(drop=True),\r\n    #             self.prediction.loc[pred_idx].reset_index(drop=True),\r\n    #             tiou_thresholds=self.tiou_thresholds)\r\n    #     return ap\r\n\r\n    ################################# copied from GTAD #######################################\r\n    def _get_predictions_with_label(self, prediction_by_label, label_name, cidx):\r\n        \"\"\"Get all predicitons of the given label. 
Return empty DataFrame if there\r\n        is no predcitions with the given label.\r\n        \"\"\"\r\n        try:\r\n            return prediction_by_label.get_group(cidx).reset_index(drop=True)\r\n        except:\r\n            if self.verbose:\r\n                print('Warning: No predictions of label \\'%s\\' were provdied.' % label_name)\r\n            return pd.DataFrame()\r\n\r\n    def wrapper_compute_average_precision(self):\r\n        \"\"\"Computes average precision for each class in the subset.\r\n        \"\"\"\r\n        ap = np.zeros((len(self.tiou_thresholds), len(self.activity_index)))\r\n\r\n        # Adaptation to query faster\r\n        ground_truth_by_label = self.ground_truth.groupby('label')\r\n        prediction_by_label = self.prediction.groupby('label')\r\n\r\n        results = Parallel(n_jobs=len(self.activity_index))(\r\n                    delayed(compute_average_precision_detection)(\r\n                        ground_truth=ground_truth_by_label.get_group(cidx).reset_index(drop=True),\r\n                        prediction=self._get_predictions_with_label(prediction_by_label, label_name, cidx),\r\n                        tiou_thresholds=self.tiou_thresholds,\r\n                    ) for label_name, cidx in self.activity_index.items())\r\n\r\n        for i, cidx in enumerate(self.activity_index.values()):\r\n            ap[:,cidx] = results[i]\r\n\r\n        return ap\r\n    #################################################################################\r\n\r\n    def evaluate(self):\r\n        \"\"\"Evaluates a prediction file. 
For the detection task we measure the\r\n        interpolated mean average precision to measure the performance of a\r\n        method.\r\n        \"\"\"\r\n        self.ap = self.wrapper_compute_average_precision()\r\n        self.mAP = self.ap.mean(axis=1)\r\n        if self.verbose:\r\n            self.logger.info('[RESULTS] Performance on ActivityNet detection task.')\r\n            self.logger.info('\\n{}'.format(' '.join(['%.4f' % (x * 1) for x in self.mAP])))\r\n            self.logger.info('\\tAverage-mAP: {}'.format(self.mAP.mean()))\r\n\r\ndef compute_average_precision_detection(ground_truth, prediction, tiou_thresholds=np.linspace(0.5, 0.95, 10)):\r\n    \"\"\"Compute average precision (detection task) between ground truth and\r\n    predictions data frames. If multiple predictions occurs for the same\r\n    predicted segment, only the one with highest score is matches as\r\n    true positive. This code is greatly inspired by Pascal VOC devkit.\r\n\r\n    Parameters\r\n    ----------\r\n    ground_truth : df\r\n        Data frame containing the ground truth instances.\r\n        Required fields: ['video-id', 't-start', 't-end']\r\n    prediction : df\r\n        Data frame containing the prediction instances.\r\n        Required fields: ['video-id, 't-start', 't-end', 'score']\r\n    tiou_thresholds : 1darray, optional\r\n        Temporal intersection over union threshold.\r\n\r\n    Outputs\r\n    -------\r\n    ap : float\r\n        Average precision score.\r\n    \"\"\"\r\n    \r\n    npos = float(len(ground_truth))\r\n    lock_gt = np.ones((len(tiou_thresholds),len(ground_truth))) * -1\r\n    # Sort predictions by decreasing score order.\r\n    sort_idx = prediction['score'].values.argsort()[::-1]\r\n    prediction = prediction.loc[sort_idx].reset_index(drop=True)\r\n\r\n    # Initialize true positive and false positive vectors.\r\n    tp = np.zeros((len(tiou_thresholds), len(prediction)))\r\n    fp = np.zeros((len(tiou_thresholds), 
len(prediction)))\r\n\r\n    # Adaptation to query faster\r\n    ground_truth_gbvn = ground_truth.groupby('video-id')\r\n    # Assigning true positive to truly grount truth instances.\r\n    for idx, this_pred in prediction.iterrows():\r\n\r\n        try:\r\n            # Check if there is at least one ground truth in the video associated.\r\n            ground_truth_videoid = ground_truth_gbvn.get_group(this_pred['video-id'])\r\n        except Exception as e:\r\n            # print(e)\r\n            fp[:, idx] = 1\r\n            continue\r\n\r\n        this_gt = ground_truth_videoid.reset_index()\r\n        tiou_arr = segment_iou(this_pred[['t-start', 't-end']].values,\r\n                               this_gt[['t-start', 't-end']].values)\r\n        # We would like to retrieve the predictions with highest tiou score.\r\n        tiou_sorted_idx = tiou_arr.argsort()[::-1]\r\n        # matched_to_difficult = False\r\n        for tidx, tiou_thr in enumerate(tiou_thresholds):\r\n            for jdx in tiou_sorted_idx:\r\n                if tiou_arr[jdx] < tiou_thr:\r\n                    fp[tidx, idx] = 1\r\n                    break\r\n                if lock_gt[tidx, this_gt.loc[jdx]['index']] >= 0:\r\n                    continue\r\n                # Assign as true positive after the filters above.\r\n                tp[tidx, idx] = 1\r\n                lock_gt[tidx, this_gt.loc[jdx]['index']] = idx\r\n                break\r\n                    \r\n            if fp[tidx, idx] == 0 and tp[tidx, idx] == 0:\r\n                fp[tidx, idx] = 1\r\n\r\n    ap = np.zeros(len(tiou_thresholds))\r\n\r\n    for tidx in range(len(tiou_thresholds)):\r\n        # Computing prec-rec\r\n        this_tp = np.cumsum(tp[tidx,:]).astype(np.float)\r\n        this_fp = np.cumsum(fp[tidx,:]).astype(np.float)\r\n        rec = this_tp / npos\r\n        prec = this_tp / (this_tp + this_fp)\r\n        ap[tidx] = interpolated_prec_rec(prec, rec)\r\n\r\n    return ap\r\n"
  },
  {
    "path": "Evaluation/utils.py",
    "content": "import json\r\nimport urllib.request, urllib.error, urllib.parse\r\n\r\nimport numpy as np\r\n\r\nAPI = 'http://ec2-52-11-11-89.us-west-2.compute.amazonaws.com/challenge17/api.py'\r\n\r\ndef get_blocked_videos(api=API):\r\n    api_url = '{}?action=get_blocked'.format(api)\r\n    req = urllib.request.Request(api_url)\r\n    response = urllib.request.urlopen(req)\r\n    return json.loads(response.read())\r\n\r\ndef interpolated_prec_rec(prec, rec):\r\n    \"\"\"Interpolated AP - VOCdevkit from VOC 2011.\r\n    \"\"\"\r\n    mprec = np.hstack([[0], prec, [0]])\r\n    mrec = np.hstack([[0], rec, [1]])\r\n    for i in range(len(mprec) - 1)[::-1]:\r\n        mprec[i] = max(mprec[i], mprec[i + 1])\r\n    idx = np.where(mrec[1::] != mrec[0:-1])[0] + 1\r\n    ap = np.sum((mrec[idx] - mrec[idx - 1]) * mprec[idx])\r\n    return ap\r\n\r\ndef segment_iou(target_segment, candidate_segments):\r\n    \"\"\"Compute the temporal intersection over union between a\r\n    target segment and all the test segments.\r\n\r\n    Parameters\r\n    ----------\r\n    target_segment : 1d array\r\n        Temporal target segment containing [starting, ending] times.\r\n    candidate_segments : 2d array\r\n        Temporal candidate segments containing N x [starting, ending] times.\r\n\r\n    Outputs\r\n    -------\r\n    tiou : 1d array\r\n        Temporal intersection over union score of the N's candidate segments.\r\n    \"\"\"\r\n    tt1 = np.maximum(target_segment[0], candidate_segments[:, 0])\r\n    tt2 = np.minimum(target_segment[1], candidate_segments[:, 1])\r\n    # Intersection including Non-negative overlap score.\r\n    segments_intersection = (tt2 - tt1).clip(0)\r\n    # Segment union.\r\n    segments_union = (candidate_segments[:, 1] - candidate_segments[:, 0]) \\\r\n      + (target_segment[1] - target_segment[0]) - segments_intersection\r\n    # Compute overlap as the ratio of the intersection\r\n    # over union of two segments.\r\n    tIoU = 
segments_intersection.astype(float) / segments_union\r\n    return tIoU\r\n\r\ndef wrapper_segment_iou(target_segments, candidate_segments):\r\n    \"\"\"Compute intersection over union btw segments\r\n    Parameters\r\n    ----------\r\n    target_segments : ndarray\r\n        2-dim array in format [m x 2:=[init, end]]\r\n    candidate_segments : ndarray\r\n        2-dim array in format [n x 2:=[init, end]]\r\n    Outputs\r\n    -------\r\n    tiou : ndarray\r\n        2-dim array [n x m] with IOU ratio.\r\n    Note: It assumes that candidate-segments are more scarce that target-segments\r\n    \"\"\"\r\n    if candidate_segments.ndim != 2 or target_segments.ndim != 2:\r\n        raise ValueError('Dimension of arguments is incorrect')\r\n\r\n    n, m = candidate_segments.shape[0], target_segments.shape[0]\r\n    tiou = np.empty((n, m))\r\n    for i in range(m):\r\n        tiou[:, i] = segment_iou(target_segments[i,:], candidate_segments)\r\n\r\n    return tiou\r\n"
  },
  {
    "path": "LICENSE",
    "content": "Copyright (c) 2021 - 2022, Xiaolong Liu et al. All Rights Reserved.\n\n                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      
\"Derivative Works\" shall mean any work, whether in Source or Object\n      form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n-----------------------------------------------------------------------\nDeformable DETR (https://github.com/fundamentalvision/Deformable-DETR)\n\nCopyright 2020, SenseTime\n\nLicensed under the Apache License, Version 2.0 (the \"License\");\nyou may not use this file except in compliance with the License.\nYou may obtain a copy of the License at\n\n      http://www.apache.org/licenses/LICENSE-2.0\n\nUnless required by applicable law or agreed to in writing, software\ndistributed under the License is distributed on an \"AS IS\" BASIS,\nWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nSee the License for the specific language governing permissions and\nlimitations under the License.\n\n-----------------------------------------------------------------------\nDETR (https://github.com/facebookresearch/detr)\n\nCopyright 2020 - present, Facebook, Inc\n\nLicensed under the Apache License, Version 2.0 (the \"License\");\nyou may not use this file except in compliance with the License.\nYou may obtain a copy of the License at\n\n   http://www.apache.org/licenses/LICENSE-2.0\n\nUnless required by applicable law or agreed to in writing, software\ndistributed under the License is distributed on an \"AS IS\" BASIS,\nWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nSee the License for the specific language governing permissions and\nlimitations under the License."
  },
  {
    "path": "README.md",
    "content": "# TadTR: End-to-end Temporal Action Detection with Transformer\n\n[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/end-to-end-temporal-action-detection-with/temporal-action-localization-on-thumos14)](https://paperswithcode.com/sota/temporal-action-localization-on-thumos14?p=end-to-end-temporal-action-detection-with)\n\nBy [Xiaolong Liu](https://github.com/xlliu7), [Qimeng Wang](https://scholar.google.com/citations?user=hi7AeE8AAAAJ), [Yao Hu](https://scholar.google.com/citations?user=LIu7k7wAAAAJ), [Xu Tang](https://scholar.google.com/citations?user=grP24aAAAAAJ), [Shiwei Zhang](https://scholar.google.com/citations?user=ZO3OQ-8AAAAJ), [Song Bai](http://songbai.site), [Xiang Bai](https://scholar.google.com/citations?user=UeltiQ4AAAAJ).\n\nThis repo holds the code for TadTR, described in the paper\n[End-to-end temporal action detection with Transformer](https://arxiv.org/abs/2106.10271) published in IEEE Transactions on Image Processing (TIP) 2022.\n\n<!-- _The tech report is out-dated. We have significantly improved TadTR since we uploaded it to arxiv. It achives much better performance now. We'll update the arxiv version recently._  -->\n\nWe have also explored fully end-to-end training from RGB images with TadTR. See our CVPR 2022 work [E2E-TAD][e2e-tad].\n\n\n## Introduction\n\nTadTR is an end-to-end Temporal Action Detection TRansformer. It has the following advantages over previous methods:\n- Simple. It adopts a set-prediction pipeline and achieves TAD with a *single network*. It does not require a separate proposal generation stage.\n- Flexible. It removes hand-crafted design such as anchor setting and NMS.\n- Sparse. It produces very sparse detections (e.g. 10 on ActivityNet), thus requiring lower computation cost.\n- Strong. As a *self-contained* temporal action detector, TadTR achieves state-of-the-art performance on HACS and THUMOS14. 
It is also much stronger than concurrent Transformer-based methods such as **RTD-Net** and **AGT**.\n\n![](data_intro/arch.png \"Architecture\")\n\n## Updates\n[2023.2.19] Fix a bug in loss calculation ([issue #21](https://github.com/xlliu7/TadTR/issues/21)). Thanks to [@zachpvin](https://github.com/zachpvin) for raising this issue!\n\n[2022.8.7] Add support for training/testing on THUMOS14!\n\n[2022.7.4] Glad to share that this paper will appear in IEEE Transactions on Image Processing (TIP). Although I am still busy with my thesis, I will try to make the code accessible soon. Thanks for your patience.\n\n[2022.6] Update the technical report of this work on arxiv (now v3).\n\n[2022.3] Our new work [E2E-TAD][e2e-tad] based on TadTR is accepted to CVPR 2022. It supports fully end-to-end training from RGB images.\n\n[2021.9.15] Update the performance on THUMOS14.\n\n[2021.9.1] Add demo code.\n\n[2021.7] Our revised paper was submitted to IEEE Transactions on Image Processing.\n\n[2021.6] Our revised paper was uploaded to arxiv.\n\n[2021.1.21] Our paper was submitted to IJCAI 2021. \n\n## TODOs\n- [x] add model code\n- [x] add inference code\n- [x] add training code\n- [x] support training/inference with video input. See [E2E-TAD][e2e-tad]\n\n## Main Results\n- HACS Segments\n\n|Method|Feature|mAP@0.5|mAP@0.75|mAP@0.95|Avg. mAP|\n| :----: |:----: | :--: | :----: | :---: | :----: |\n|TadTR|I3D RGB|47.14 |32.11 |10.94| 32.09|\n\n\n- THUMOS14\n\n|Method|Feature|mAP@0.3|mAP@0.4|mAP@0.5|mAP@0.6|mAP@0.7|Avg. mAP|\n| :----: |:----: | :--: | :----: | :---: | :----: |:----: | :----: |\n|TadTR|I3D 2stream|74.8 |69.1| 60.1| 46.6| 32.8| 56.7|\n\n- ActivityNet-1.3\n\n|Method|Feature|mAP@0.5|mAP@0.75|mAP@0.95|Avg. 
mAP|\n| :----: |:----: | :--: | :----: | :---: | :----: |\n|TadTR|TSN 2stream|51.29 |34.99| 9.49| 34.64|\n|TadTR|TSP|53.62| 37.52| 10.56| 36.75|\n\n\n## Install\n### Requirements\n\n* Linux or Windows\n  \n* Python>=3.7\n\n* (Optional) CUDA>=9.2, GCC>=5.4\n  \n* PyTorch>=1.5.1, torchvision>=0.6.1 (following instructions [here](https://pytorch.org/))\n  \n* Other requirements\n    ```bash\n    pip install -r requirements.txt\n    ```\n### Compiling CUDA extensions (Optional)\nThe RoIAlign operator is implemented as a CUDA extension.\nIf your machine has an NVIDIA GPU with CUDA support, you can run this step. Otherwise, please set `disable_cuda=True` in `opts.py`.\n```bash\ncd model/ops;\n\n# If you have multiple installations of CUDA Toolkits, you'd better add a prefix\n# CUDA_HOME=<your_cuda_toolkit_path> to specify the correct version. \npython setup.py build_ext --inplace\n```\n\n### Run a quick test\n```\npython demo.py\n```\n\n## 1.Data Preparation\nCurrently we only support `thumos14`.\n\n### THUMOS14\nDownload all data from [[BaiduDrive(code: adTR)]](https://pan.baidu.com/s/183VprlbKNjMb3Gr-rfmROQ) or [[OneDrive]](https://husteducn-my.sharepoint.com/:f:/g/personal/liuxl_hust_edu_cn/EsMyXDlkrTdBsikoRQSIeUsBkxJJRsplbMyIQVYotiZRIQ?e=QYgiCH).\n\n- Features: Download the I3D features `I3D_2stream_Pth.tar`. It was originally provided by the authors of P-GCN. I have concatenated the RGB and Flow features (drop the tail of the longer one if the lengths are inconsistent) and converted the data to float32 precision to save space.\n- Annotations: The annotations of action instances and the meta information of feature files. Both are in JSON format (`th14_annotations_with_fps_duration.json` and `th14_i3d2s_ft_info.json`).\n- Pre-trained Reference Models: Our pre-trained model that uses I3D features `thumos14_i3d2s_tadtr_reference.pth`. 
This model corresponds to the config file `configs/thumos14_i3d2s_tadtr.yml`.\n\nAfter downloading is finished, extract the archived feature files in place with `cd data;tar -xf I3D_2stream_Pth.tar`. Then put the features, annotations, and the model under the `data/thumos14` directory. We expect the following structure in the root folder.\n```\n- data\n  - thumos14\n    - I3D_2stream_Pth\n     - xxxxx\n     - xxxxx\n    - th14_annotations_with_fps_duration.json\n    - th14_i3d2s_ft_info.json\n    - thumos14_tadtr_reference.pth\n```\n\n\n## 2.Testing Pre-trained Models\nRun\n```\npython main.py --cfg CFG_PATH --eval --resume CKPT_PATH\n```\nCFG_PATH is the path to the YAML-format config file that defines the experimental setting. For example, `configs/thumos14_i3d2s_tadtr.yml`. CKPT_PATH is the path of the pre-trained model. Alternatively, you can execute the shell script `bash scripts/test_reference_models.sh thumos14` for simplicity.\n\n\n## 3.Training by Yourself\nRun the following command\n```\npython main.py --cfg CFG_PATH\n```\n\nThis codebase supports running on both CPU and GPU. \n- To run on CPU: please add ` --device cpu` to the above command. Also, you need to set `disable_cuda=True` in `opts.py`. The CPU mode does not support actionness regression and the detection performance is lower.\n- To run on GPU: since the model is very lightweight, just one GPU is enough. You may specify the GPU device ID (e.g., 0) to use by adding the prefix `CUDA_VISIBLE_DEVICES=ID ` before the above command. To run on multiple GPUs, please refer to `scripts/run_parallel.sh`.\n\nDuring training, our code will automatically perform testing every N epochs (N is the `test_interval` in opts.py). Training takes 6~10 minutes on THUMOS14 if you use a modern GPU (e.g. TITAN Xp). You can also monitor the training process with Tensorboard (need to set `cfg.tensorboard=True` in `opts.py`). 
The tensorboard record and the checkpoint will be saved at `output_dir` (can be modified in the config file).\n\nAfter training is done, you can also test your trained model by running\n```\npython main.py --cfg CFG_PATH --eval\n```\nIt will automatically use the best model checkpoint. If you want to manually specify the model checkpoint, run\n```\npython main.py --cfg CFG_PATH --eval --resume CKPT_PATH\n```\n\nNote that the performance of a model you train yourself may be different from the reference model, even though all seeds are fixed. The reason is that TadTR uses the `grid_sample` operator, whose gradient computation involves the non-deterministic `AtomicAdd` operator. Please refer to [ref1](https://pytorch.org/docs/stable/notes/randomness.html) [ref2](https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html#torch.use_deterministic_algorithms) [ref3(Chinese)](https://zhuanlan.zhihu.com/p/109166845) for details.\n\n## Acknowledgement\nThe code is based on [DETR](https://github.com/facebookresearch/detr) and [Deformable DETR](https://github.com/fundamentalvision/Deformable-DETR). We also borrow the implementation of the RoIAlign1D from [G-TAD](https://github.com/Frostinassiky/gtad). Thanks for their great work.\n\n## Citing\n```\n@article{liu2022end,\n  title={End-to-end Temporal Action Detection with Transformer},\n  author={Liu, Xiaolong and Wang, Qimeng and Hu, Yao and Tang, Xu and Zhang, Shiwei and Bai, Song and Bai, Xiang},\n  journal={IEEE Transactions on Image Processing (TIP)},\n  year={2022}\n}\n```\n\n## Contact\n\nFor questions and suggestions, please contact Xiaolong Liu by email (\"liuxl at hust dot edu dot cn\").\n\n[e2e-tad]: https://github.com/xlliu7/E2E-TAD\n"
  },
  {
    "path": "configs/thumos14_i3d2s_tadtr.yml",
    "content": "# model setting\nenc_layers: 4\ndec_layers: 4\ndim_feedforward: 1024\nnum_queries: 40\n\n# data setting\ndataset_name: thumos14\nfeature: i3d2s\nfeature_dim: 2048\nonline_slice: true\nslice_len: 128\nslice_overlap: 0.75\ntest_slice_overlap: 0.25\n\n# output\noutput_dir: outputs/thumos14_i3d2s_tadtr\n\n"
  },
  {
    "path": "datasets/__init__.py",
    "content": "from .tad_dataset import build as build_video_dataset\n\n\ndef build_dataset(subset, args, mode):\n    if args.dataset_name in ['activitynet', 'thumos14', 'hacs', 'muses']:\n        return build_video_dataset(args.dataset_name, subset, args, mode)\n    \n    raise ValueError(f'dataset {args.dataset_name} not supported')"
  },
  {
    "path": "datasets/data_utils.py",
    "content": "'''Utilities for data loading'''\n\nimport json\nimport math\nimport logging\nimport os\n\nimport pandas as pd\nimport easydict\nimport yaml\n\nimport numpy as np\n# import cv2\nimport torch\nimport torch.nn.functional as F\n# import ipdb as pdb\n\ndef load_json(path):\n    return json.load(open(path))\n\n\ndef get_valid_anno(gt_instances, slice, thr=0.75,\n        start_getter=lambda x: x['segment'][0],\n        end_getter=lambda x: x['segment'][1]):\n    '''Perform integrity based instance filtering'''\n    start, end = slice\n    kept_instances = []\n    for inst in gt_instances:\n        # ignore insts outside the time window (slice)\n        if end_getter(inst) <= start or start_getter(inst) >= end:\n            continue\n        else:\n            # clamped inst\n            new_start = max(start_getter(inst), start)\n            new_end = min(end_getter(inst), end)\n            integrity = (new_end - new_start) * 1.0 / (end_getter(inst) - start_getter(inst))\n            \n            if integrity >= thr:\n                new_inst = {k:v for k,v in inst.items()}\n                new_inst['segment'] = [new_start - start, new_end - start]\n                kept_instances.append(new_inst)\n    return kept_instances\n\n\ndef get_dataset_dict(video_info_path, video_anno_path, subset, mode='test', exclude_videos=None, online_slice=False, slice_len=None, ignore_empty=True, slice_overlap=0, return_id_list=False):\n    '''\n    Prepare a dict that contains the information of each video, such as duration, annotations.\n    Args:\n        video_info_path: path to the video info file in json format. This file records the length and fps of each video.\n        video_anno_path: path to the ActivityNet-style video annotation in json format.\n        subset: e.g. train, val, test\n        mode: train (for training) or test (for inference).\n        online_slice: cut videos into slices for training and testing. 
It should be enabled if the videos are too long.\n        slice_len: length of video slices.\n        ignore_empty: ignore video slices that does not contain any action instance. This should be enabled only in the training phase.\n        slice_overlap: overlap ration between adjacent slices (= overlap_length / slice_len)\n\n    Return:\n        dict\n    '''\n    video_ft_info = load_json(video_info_path)\n    anno_data = load_json(video_anno_path)['database']\n\n    video_dict = {}\n    id_list = []\n    cnt = 0\n\n    video_set = set([x for x in anno_data if anno_data[x]['subset'] in subset])\n    video_set = video_set.intersection(video_ft_info.keys())\n\n    if exclude_videos is not None:\n        assert isinstance(exclude_videos, (list, tuple))\n        video_set = video_set.difference(exclude_videos)\n\n    video_list = list(sorted(video_set))\n\n    for video_name in video_list:\n        # remove ambiguous instances on THUMOS14\n        annotations = [x for x in anno_data[video_name]['annotations'] if x['label'] != 'Ambiguous']\n        annotations = list(sorted(annotations, key=lambda x: sum(x['segment'])))\n\n        if video_name in video_ft_info:\n            # video_info records the length in snippets, duration and fps (#frames per second) of the feature/image sequence\n            video_info = video_ft_info[video_name]\n            # number of frames or snippets\n            feature_length = int(video_info['feature_length'])   \n            feature_fps = video_info['feature_fps']\n            feature_second = video_info['feature_second']\n        else:\n            continue\n\n        video_subset = anno_data[video_name]['subset']\n        # For THUMOS14, we crop video into slices of fixed length\n        if online_slice:\n            stride = slice_len * (1 - slice_overlap)\n\n            if feature_length <= slice_len:\n                slices = [[0, feature_length]]\n            else:\n                # stride * (i - 1) + slice_len <= 
feature_length\n                # i <= (feature_length - slice_len)\n                num_complete_slices = int(math.floor(\n                    (feature_length / slice_len - 1) / (1 - slice_overlap) + 1))\n                slices = [\n                    [int(i * stride), int(i * stride) + slice_len] for i in range(num_complete_slices)]\n                if (num_complete_slices - 1) * stride + slice_len < feature_length:\n                    # if video_name == 'video_test_0000006':\n                    #     pdb.set_trace()\n                    if mode != 'train':\n                        # take the last incomplete slice\n                        last_slice_start = int(stride * num_complete_slices)\n                    else:\n                        # move left to get a complete slice.\n                        # This is a historical issue. The performance might be better\n                        # if we keep the same rule for training and inference \n                        last_slice_start = max(0, feature_length - slice_len)\n                    slices.append([last_slice_start, feature_length])\n            num_kept_slice = 0\n            for slice in slices:\n                time_slices = [slice[0] / video_info['feature_fps'], slice[1] / video_info['feature_fps']]\n                feature_second = time_slices[1] - time_slices[0]\n                # perform integrity-based instance filtering\n                valid_annotations = get_valid_anno(annotations, time_slices)\n                \n                if not ignore_empty or len(valid_annotations) >= 1:\n                    # rename the video slice\n                    new_vid_name = video_name + '_window_{}_{}'.format(*slice)\n                    new_vid_info = {\n                        'annotations': valid_annotations, 'src_vid_name': video_name, \n                        'feature_fps': feature_fps, 'feature_length': slice_len, \n                        'subset': subset, 'feature_second': feature_second, 
'time_offset': time_slices[0]}\n                    video_dict[new_vid_name] = new_vid_info\n                    id_list.append(new_vid_name)\n                    num_kept_slice += 1\n            if num_kept_slice > 0:\n                cnt += 1\n        # for ActivityNet and hacs, use the full-length videos as samples\n        else:\n            if not ignore_empty or len(annotations) >= 1:\n                # Remove incorrect annotions on ActivityNet\n                valid_annotations = [x for x in annotations if x['segment'][1] - x['segment'][0] > 0.02]\n\n                if ignore_empty and len(valid_annotations) == 0:\n                    continue\n                \n                video_dict[video_name] = {\n                    'src_vid_name': video_name, 'annotations': valid_annotations, \n                    'feature_fps': feature_fps, 'feature_length': int(feature_length),\n                    'subset': video_subset, 'feature_second': feature_second, 'time_offset': 0}\n                id_list.append(video_name)\n                cnt += 1\n    logging.info('{} videos, {} slices'.format(cnt, len(video_dict)))\n    if return_id_list:\n        return video_dict, id_list\n    else:\n        return video_dict\n\n\ndef load_video_frames(frame_dir, start, seq_len, stride=1, fn_tmpl='img_%07d.jpg'):\n    raise NotImplementedError\n\n\ndef load_feature(ft_path, ft_format, shape=None):\n    if ft_format == 'npy':\n        video_df = np.load(ft_path)\n        if shape == \"CT\":\n            video_df = video_df.T\n    elif ft_format == 'torch':\n        video_df = torch.load(ft_path).numpy()\n\n    else:\n        raise ValueError('unsupported feature format: {}'.format(ft_format))\n    return video_df\n\n\ndef get_dataset_info(dataset, feature):\n    '''get basic information for each dataset'''\n\n    path_info = easydict.EasyDict(yaml.load(open('datasets/path.yml'), yaml.SafeLoader))\n\n    if dataset == 'thumos14':\n        subset_mapping = {'train': 'val', 'val': 
'test'}\n        ann_file = path_info['thumos14']['ann_file']\n    \n        if feature == 'i3d2s':\n            feature_info = {'local_path': path_info['thumos14'][feature]['local_path'], 'format': 'torch', 'fn_templ': '%s'}\n            ft_info_file = path_info['thumos14'][feature]['ft_info_file']\n\n        else:\n            raise ValueError('unsupported feature, should be one of [i3d2s]')\n\n    elif dataset == 'activitynet':\n        raise NotImplementedError\n\n    elif dataset == 'hacs':\n        raise NotImplementedError\n    \n    elif dataset == 'muses':\n        raise NotImplementedError\n        \n    else:\n        raise ValueError('unsupported dataset {}'.format(dataset))\n\n    return subset_mapping, feature_info, ann_file, ft_info_file\n\n\n\ndef make_img_transform(*args, **kwargs):\n    raise NotImplementedError"
  },
  {
    "path": "datasets/path.yml",
    "content": "# set the path of features, anno file and feature info file\n\nthumos14:\n  ann_file: 'data/thumos14/th14_annotations_with_fps_duration.json'\n  i3d2s:\n    local_path: data/thumos14/I3D_2stream_Pth\n    ft_info_file: 'data/thumos14/th14_i3d2s_ft_info.json'\n"
  },
  {
    "path": "datasets/tad_dataset.py",
    "content": "# ------------------------------------------------------------------------\n# TadTR: End-to-end Temporal Action Detection with Transformer\n# Copyright (c) 2021 - 2022. Xiaolong Liu.\n# ------------------------------------------------------------------------\n\n'''Universal TAD Dataset loader.'''\n\nimport json\nimport logging\nimport math\nimport os.path as osp\n\nimport numpy as np\nimport pandas as pd\nimport torch\nimport torch.nn.functional as F\nimport torch.utils.data\nimport tqdm\nimport h5py\n\nfrom .data_utils import get_dataset_dict, load_feature, load_video_frames, get_dataset_info, make_img_transform\n# from util.config import cfg\nfrom util.segment_ops import segment_t1t2_to_cw\n\n\n\nclass TADDataset(torch.utils.data.Dataset):\n    def __init__(self, subset, mode, feature_info, ann_file, ft_info_file, transforms, mem_cache=False, online_slice=False, slice_len=None, slice_overlap=0, binary=False, padding=True, input_type='feature', img_stride=1):\n        '''TADDataset\n        Parameters:\n            subset: train/val/test\n            mode: train, or test\n            feature_info: basic info of video features, e.g. path, file format, filename template\n            ann_file: path to the ground truth file\n            ft_info_file: path to the file that describe other information of each video\n            transforms: which transform to use\n            mem_cache: cache features of the whole dataset into memory.\n            binary: transform all gt to binary classes. 
This is required for training a class-agnostic detector\n            padding: whether to pad the input feature to `slice_len`\n        \n        '''\n\n        super().__init__()\n        self.feature_info = feature_info\n        self.ann_file = ann_file\n        self.ft_info_file = ft_info_file\n        self.subset = subset\n        self.online_slice = online_slice\n        self.slice_len = slice_len\n        self.slice_overlap = slice_overlap\n        self.padding = padding\n        self.mode = mode\n        self.transforms = transforms\n        print('Use data transform {}'.format(self.transforms))\n        self.binary = binary\n        self.is_image_input = input_type == 'image'\n        self.mem_cache = mem_cache\n        self.img_stride = img_stride\n\n        self._prepare()\n\n    def _get_classes(self, anno_dict):\n        '''get class list from the annotation dict'''\n        if 'classes' in anno_dict:\n            classes = anno_dict['classes']\n        else:\n            database = anno_dict['database']\n            all_gts = []\n            for vid in database:\n                all_gts += database[vid]['annotations']\n            classes = list(sorted({x['label'] for x in all_gts}))\n        return classes\n\n    def _prepare(self):\n        '''parse annotation file'''\n        anno_dict = json.load(open(self.ann_file))\n        self.classes = self._get_classes(anno_dict)\n      \n        self.video_dict, self.video_list = get_dataset_dict(self.ft_info_file, self.ann_file, self.subset, mode=self.mode, online_slice=self.online_slice, slice_len=self.slice_len, slice_overlap=self.slice_overlap, ignore_empty=self.mode == 'train', return_id_list=True)\n\n        # video_list = self.video_dict.keys()\n        # self.video_list = list(sorted(video_list))\n     \n        logging.info(\"{} subset video numbers: {}\".format(self.subset,len(self.video_list)))\n        self.anno_dict = anno_dict\n\n        self.cached_data = {}\n\n        # if the features of all 
videos is saved in one hdf5 file (all in one), e.g. TSP features\n        self.all_video_data = {}\n        feature_info = self.feature_info\n        fn_templ = feature_info['fn_templ']\n        src_video_list = {self.video_dict[k]['src_vid_name'] for k in self.video_list}\n        # \n        if feature_info.get('all_in_one', False):\n            data = h5py.File(feature_info['local_path'][self.subset])\n            for k in src_video_list:\n                self.all_video_data[k] = np.array(data[fn_templ % k]).T\n            if not self.online_slice:\n                self.cached_data = self.all_video_data\n\n    def __len__(self):\n        return len(self.video_list)\n\n    def _get_video_data(self, index):\n        if self.is_image_input:\n            return self._get_img_data(index)\n        else:\n            return self._get_feature_data(index)\n\n    def _get_feature_data(self,index):\n        video_name = self.video_list[index]\n        # directly fetch from memory\n        if video_name in self.cached_data:\n            video_data = self.cached_data[video_name]\n            return torch.Tensor(video_data).float().contiguous()\n\n        src_vid_name = self.video_dict[video_name]['src_vid_name']\n        # retrieve feature info\n        feature_info = self.feature_info\n        # \"ft\" is short for \"feature\"\n        local_ft_dir = feature_info['local_path']\n        ft_format = feature_info['format']\n        local_ft_path = osp.join(local_ft_dir, feature_info['fn_templ'] % src_vid_name) if local_ft_dir else None\n        # the shape of feature sequence, can be TxC (in most cases) or CxT\n        shape = feature_info.get('shape', 'TC')\n\n        if src_vid_name in self.all_video_data:\n            feature_data = self.all_video_data[src_vid_name].T\n        \n        else:\n            feature_data = load_feature(local_ft_path, ft_format, shape)\n\n        feature_data = feature_data.T   # T x C to C x T.\n\n        if self.online_slice:\n            
slice_start, slice_end = [int(x) for x in video_name.split('_')[-2:]]\n            assert slice_end  > slice_start\n            assert slice_start < feature_data.shape[1]\n            feature_data = feature_data[:, slice_start:slice_end]\n\n            if self.padding and feature_data.shape[1] < self.slice_len:\n                diff = self.slice_len - feature_data.shape[1]\n                feature_data = np.pad(\n                    feature_data, ((0, 0), (0, diff)), mode='constant')\n\n                # IMPORATANT: if padded is done, the length info must be modified\n                self.video_dict[video_name]['feature_length'] = self.slice_len\n                self.video_dict[video_name]['feature_second'] = self.slice_len / self.video_dict[video_name]['feature_fps']\n\n        if self.mem_cache and video_name not in self.cached_data:\n            self.cached_data[video_name] = feature_data\n\n        feature_data = torch.Tensor(feature_data).float().contiguous()\n        return feature_data\n\n    def _get_img_data(self, index):\n        '''have not been tested'''\n        raise NotImplementedError\n\n    def _get_train_label(self, video_name):\n        '''get normalized target'''\n        video_info = self.video_dict[video_name]\n        video_labels = video_info['annotations']\n        feature_second = video_info['feature_second']\n      \n        target = {\n            'segments': [], 'labels': [],\n            'orig_labels': [], 'video_id': video_name,\n            'video_duration': feature_second,   # only used in inference\n            'feature_fps': video_info['feature_fps'],\n            }\n        for j in range(len(video_labels)):\n            tmp_info=video_labels[j]\n           \n            segment = tmp_info['segment'] \n            # special rule for thumos14, treat ambiguous instances as negatives\n            if tmp_info['label'] not in self.classes:\n                continue\n            # the label id of first forground class is 0\n            
label_id = self.classes.index(tmp_info['label'])\n            target['orig_labels'].append(label_id)\n\n            if self.binary:\n                label_id = 0\n            target['segments'].append(segment)\n            target['labels'].append(label_id)\n\n        # normalized the coordinate\n        target['segments'] = np.array(target['segments']) / feature_second\n        \n        if len(target['segments']) > 0:\n            target['segments'] = segment_t1t2_to_cw(target['segments'])\n\n            # convert to torch format\n            for k, dtype in zip(['segments', 'labels'], ['float32', 'int64']):\n                if not isinstance(target[k], torch.Tensor):\n                    target[k] = torch.from_numpy(np.array(target[k], dtype=dtype))\n       \n        return target\n\n    def __getitem__(self, index):\n        # index = index % len(self.video_list)\n        video_data = self._get_video_data(index)\n        video_name = self.video_list[index]\n\n        target =  self._get_train_label(video_name)\n        \n        return video_data, target\n       \n\ndef build(dataset, subset, args, mode):\n    '''build TADDataset'''\n    subset_mapping, feature_info, ann_file, ft_info_file = get_dataset_info(dataset, args.feature)\n    transforms = None\n    if args.input_type == 'image':\n        transforms = make_img_transform(mode)\n    else:\n        transforms = None\n    return TADDataset(\n        subset_mapping[subset], mode, feature_info, ann_file, ft_info_file, transforms,\n        online_slice=args.online_slice, slice_len=args.slice_len, slice_overlap=args.slice_overlap if mode=='train' else args.test_slice_overlap, \n        binary=args.binary,\n        input_type=args.input_type)\n"
  },
  {
    "path": "datasets/tad_eval.py",
    "content": "# TadTR: End-to-end Temporal Action Detection with Transformer\n\nimport json\nimport os.path as osp\nimport os\nimport pandas as pd\nimport time\nimport numpy as np\nimport logging\nimport concurrent.futures\nimport sys\nimport logging\n# import ipdb as pdb\nimport pickle\n\nfrom opts import cfg\n\nfrom Evaluation.eval_detection import compute_average_precision_detection\n# from Evaluation.eval_proposal import average_recall_vs_avg_nr_proposals\nimport matplotlib.pyplot as plt\n# from util.proposal_utils import soft_nms\nfrom .data_utils import get_dataset_dict\nfrom util.misc import all_gather\nfrom util.segment_ops import soft_nms, temporal_nms\n\n\ndef eval_ap(iou, cls, gt, predition):\n    ap = compute_average_precision_detection(gt, predition, iou)\n    sys.stdout.flush()\n    return cls, ap\n\n\ndef apply_nms(dets_arr, nms_thr=0.4, use_soft_nms=False):\n    # the last column are class ids\n    unique_classes = np.unique(dets_arr[:, 3])\n    output_dets = []\n    for cls in unique_classes:\n        this_cls_dets = dets_arr[dets_arr[:,3] == cls]\n        if not use_soft_nms:\n            this_cls_dets_kept = temporal_nms(this_cls_dets, nms_thr)\n        else:\n            classes = this_cls_dets[:, [3]]\n            this_cls_dets_kept = soft_nms(this_cls_dets, 0.8, 0, 0, 100)\n            this_cls_dets_kept = np.concatenate((this_cls_dets_kept, classes), -1)\n        output_dets.append(this_cls_dets_kept)\n    output_dets = np.concatenate(output_dets, axis=0)\n    sort_idx = output_dets[:, 2].argsort()[::-1]\n    output_dets = output_dets[sort_idx, :]\n    return output_dets\n\n\nclass TADEvaluator(object):\n    def __init__(self, dataset_name, subset, video_dict=None, nms_mode=['raw'], iou_range=[0.5], epoch=None, num_workers=None):\n        '''dataset_name:  thumos14, activitynet or hacs\n        subset: val or test\n        video_dict: the dataset dict created in video_dataset.py\n        iou_range: [0.3:0.7:0.1] for thumos14; 
[0.5:0.95:0.05] for anet and hacs.\n        '''\n\n        self.epoch = epoch\n        self.iou_range = iou_range\n        self.nms_mode = nms_mode\n        self.dataset_name = dataset_name\n        self.ignored_videos = list()\n\n        if dataset_name == 'thumos14':\n            subset_mapping = {'train': 'val', 'val': 'test'}\n            anno_file = 'data/thumos14/th14_annotations_with_fps_duration.json'\n            # follow SSN/PGCN/AFSD/MUSES to remove three falsely annotated videos\n            self.ignored_videos = ['video_test_0000270', 'video_test_0001292', 'video_test_0001496']\n        else:\n            raise NotImplementedError\n        anno_dict = json.load(open(anno_file))\n        classes = self._get_classes(anno_dict)\n        num_classes = len(classes)\n        \n        database = anno_dict['database']\n        all_gt = []\n\n        unique_video_list = [x for x in database if database[x]['subset'] in subset_mapping[subset]]\n\n        for vid in unique_video_list:\n            if vid in self.ignored_videos:\n                continue\n            this_gts = [x for x in database[vid]['annotations'] if x['label'] != 'Ambiguous']\n            all_gt += [[vid, classes.index(x['label']), x['segment'][0], x['segment'][1]] for x in this_gts]\n\n        all_gt = pd.DataFrame(all_gt, columns=[\"video-id\", \"cls\",\"t-start\", \"t-end\"])\n        self.video_ids = all_gt['video-id'].unique().tolist()\n        logging.info('{} ground truth instances from {} videos'.format(len(all_gt), len(self.video_ids)))\n\n        # per class ground truth\n        gt_by_cls = []\n        for cls in range(num_classes):\n            gt_by_cls.append(all_gt[all_gt.cls == cls].reset_index(drop=True).drop('cls', 1))\n\n        self.gt_by_cls = gt_by_cls\n        self.all_pred = {k: [] for k in self.nms_mode}\n        self.num_classes = num_classes\n        self.classes = classes\n        self.anno_dict = anno_dict\n        self.all_gt = all_gt\n        self.num_workers = 
num_classes if num_workers is None else num_workers\n        self.video_dict = video_dict\n        self.stats = {k: dict() for k in self.nms_mode}\n        self.subset = subset\n\n    def _get_classes(self, anno_dict):\n        if 'classes' in anno_dict:\n            classes = anno_dict['classes']\n        else:\n            \n            database = anno_dict['database']\n            all_gts = []\n            for vid in database:\n                all_gts += database[vid]['annotations']\n            classes = list(sorted({x['label'] for x in all_gts}))\n        return classes\n\n    def update(self, pred, assign_cls_labels=False):\n        '''pred: a dict of predictions for each video. For each video, the predictions are in a dict with these fields: scores, labels, segments\n        assign_cls_labels: manually assign class labels to the detections. This is necessary when the predictions are class-agnostic.\n        '''\n        pred_numpy = {k: {kk: vv.detach().cpu().numpy() for kk, vv in v.items()} for k,v in pred.items()}\n        for k, v in pred_numpy.items():\n            # pdb.set_trace()\n            if 'window' not in k:\n                this_dets = [\n                    [v['segments'][i, 0], \n                     v['segments'][i, 1],\n                     v['scores'][i], v['labels'][i]]\n                     for i in range(len(v['scores']))]\n                video_id = k\n            else:\n                window_start = self.video_dict[k]['time_offset']\n                video_id = self.video_dict[k]['src_vid_name']\n                this_dets = [\n                    [v['segments'][i, 0] + window_start, \n                     v['segments'][i, 1] + window_start, \n                     v['scores'][i],\n                     v['labels'][i]]\n                    for i in range(len(v['scores']))]\n            \n            # ignore videos that are not in ground truth set\n            if video_id not in self.video_ids:\n                continue\n            
this_dets = np.array(this_dets)   # start, end, score, label\n            \n            for nms_mode in self.nms_mode:\n                input_dets = np.copy(this_dets)\n                # if nms_mode == 'nms' and not (cfg.TEST_SLICE_OVERLAP > 0 and self.dataset_name == 'thumos14'):  # when cfg.TEST_SLICE_OVERLAP > 0, only do nms at summarization\n                #     dets = apply_nms(input_dets, nms_thr=cfg.nms_thr, use_soft_nms=self.dataset_name=='activitynet' and assign_cls_labels)\n                # else:\n                if True:\n                    sort_idx = input_dets[:, 2].argsort()[::-1]\n                    dets = input_dets[sort_idx, :]\n\n                # only keep top 200 detections per video\n                dets = dets[:200, :]\n\n                # On ActivityNet, follow the tradition to use external video label\n                if assign_cls_labels:\n                        raise NotImplementedError\n                self.all_pred[nms_mode] += [[video_id, k] + det for det in dets.tolist()]\n\n\n    def nms_whole_dataset(self):\n        video_ids = list(set([v['src_vid_name'] for k, v in self.video_dict.items()]))\n        all_pred = []\n        for vid in video_ids:\n            this_dets = self.all_pred['nms'][self.all_pred['nms']['video-id'] == vid][['t-start', 't-end', 'score', 'cls']].values\n            \n            this_dets = apply_nms(this_dets)[:200, ...]\n            this_dets = [[vid] + x.tolist() for x in this_dets]\n            all_pred += this_dets\n        self.all_pred['nms'] = pd.DataFrame(all_pred, columns=[\"video-id\", \"t-start\", \"t-end\", \"score\", \"cls\"])\n\n    def cross_window_fusion(self):\n        '''\n        merge detections in the overlapped regions of adjacent windows. 
Only used for THUMOS14\n        '''\n        # video_ids = list(set([v['src_vid_name'] for k, v in self.video_dict.items()]))\n        all_pred = []\n\n        video_ids = self.all_pred['raw']['video-id'].unique()\n        vid = video_ids[0]\n\n        for vid in video_ids:\n            this_dets = self.all_pred['raw'][self.all_pred['raw']['video-id'] == vid]\n            slice_ids = this_dets['slice-id'].unique().tolist()\n            if len(slice_ids) > 1:\n                slice_sorted = sorted(slice_ids, key=lambda k: int(k.split('_')[4]))\n               \n                overlap_region_time_list = []\n                for i in range(0, len(slice_sorted) - 1):\n                    slice_name = slice_sorted[i]\n                    feature_fps = self.video_dict[slice_name]['feature_fps']\n                    time_base = 0  # self.video_dict[slice_name]['time_base']\n                    # parse the temporal coordinate from name\n                    cur_slice = [int(x) for x in slice_sorted[i].split('_')[4:6]]\n                    next_slice = [int(x) for x in slice_sorted[i+1].split('_')[4:6]]\n                    overlap_region_time = [next_slice[0], cur_slice[1]]\n                    # add time offset of each window/slice\n                    overlap_region_time = [time_base + overlap_region_time[iii] / feature_fps for iii in range(2)]\n                    overlap_region_time_list.append(overlap_region_time)\n                \n                mask_union = None\n                processed_dets = []\n                for overlap_region_time in overlap_region_time_list:\n                    inters = np.minimum(this_dets['t-end'], overlap_region_time[1]) - np.maximum(this_dets['t-start'], overlap_region_time[0])\n                    # we only perform NMS to the overlapped regions\n                    mask = inters > 0\n                    overlap_dets = this_dets[mask]\n                    overlap_dets_arr = overlap_dets[['t-start', 't-end', 'score', 'cls']].values\n   
                 if len(overlap_dets) > 0:\n                        kept_dets_arr = apply_nms(np.concatenate((overlap_dets_arr, np.arange(len(overlap_dets_arr))[:, None]), axis=1))\n                        processed_dets.append(overlap_dets.iloc[kept_dets_arr[:, -1].astype('int64')])\n                    \n                    if mask_union is not None:\n                        mask_union = mask_union | mask\n                    else:\n                        mask_union = mask\n                # instances not in overlapped region\n                processed_dets.append(this_dets[~mask_union])\n                all_pred += processed_dets\n            else:\n                all_pred.append(this_dets)\n\n        all_pred = pd.concat(all_pred)\n        self.all_pred['raw'] = all_pred\n\n    def accumulate(self, test_slice_overlap=0):\n        '''accumulate detections in all videos'''\n        for nms_mode in self.nms_mode:\n            self.all_pred[nms_mode] = pd.DataFrame(self.all_pred[nms_mode], columns=[\"video-id\", \"slice-id\", \"t-start\", \"t-end\", \"score\", \"cls\"])\n        \n        self.pred_by_cls = {}\n        for nms_mode in self.nms_mode:\n            if self.dataset_name == 'thumos14' and nms_mode == 'raw' and test_slice_overlap > 0:\n                self.cross_window_fusion()\n            # if you really want to use NMS\n            if self.dataset_name == 'thumos14' and nms_mode == 'nms' and test_slice_overlap > 0:\n                self.nms_whole_dataset()\n\n            self.pred_by_cls[nms_mode] = [self.all_pred[nms_mode][self.all_pred[nms_mode].cls == cls].reset_index(drop=True).drop('cls', 1) for cls in range(self.num_classes)]\n\n    def import_prediction(self):\n        pass\n\n    def format_arr(self, arr, format='{:.2f}'):\n        line = ' '.join([format.format(x) for x in arr])\n        return line\n\n    def synchronize_between_processes(self):\n        mode = self.nms_mode[0]\n        print(\n            len(self.all_pred[mode]),\n       
     len({x[0] for x in self.all_pred[mode]})\n        )\n        self.all_pred = merge_distributed(self.all_pred)\n\n    def summarize(self):\n        '''Compute mAP and collect stats'''\n        if self.dataset_name in ['thumos14', 'muses']:\n            # 0.3~0.7 avg\n            display_iou_thr_inds = [0, 1, 2, 3, 4]\n        else:\n            # 0.5 0.75 0.95 avg\n            display_iou_thr_inds = [0, 5, 9]\n        \n        for nms_mode in self.nms_mode:\n            logging.info(\n                'mode={} {} predictions from {} videos'.format(\n                    nms_mode,\n                    len(self.all_pred[nms_mode]),\n                    len(self.all_pred[nms_mode]['video-id'].unique()))\n            )\n\n        header = ' '.join('%.2f' % self.iou_range[i] for i in display_iou_thr_inds) + ' avg'  # 0 5 9\n        lines = []\n        for nms_mode in self.nms_mode:\n            per_iou_ap = self.compute_map(nms_mode)\n            line = ' '.join(['%.2f' % (100*per_iou_ap[i]) for i in display_iou_thr_inds]) + ' %.2f' % (100*per_iou_ap.mean()) + ' {} epoch{}'.format(nms_mode, self.epoch)\n            lines.append(line)\n        msg = header\n        for l in lines:\n            msg += '\\n' + l\n        logging.info('\\n' + msg)\n\n        for nms_mode in self.nms_mode:\n            if self.dataset_name == 'thumos14':\n                self.stats[nms_mode]['AP50'] = self.stats[nms_mode]['per_iou_ap'][2]\n            else:\n                self.stats[nms_mode]['AP50'] = self.stats[nms_mode]['per_iou_ap'][0]\n        self.stats_summary = msg\n\n    def compute_map(self, nms_mode):\n        '''Compute mean average precision'''\n        start_time = time.time()\n\n        gt_by_cls, pred_by_cls = self.gt_by_cls, self.pred_by_cls[nms_mode]\n\n        iou_range = self.iou_range\n        num_classes = self.num_classes\n        ap_values = np.zeros((num_classes, len(iou_range)))\n\n        with concurrent.futures.ProcessPoolExecutor(min(self.num_workers, 8)) as 
p:\n            futures = []\n            for cls in range(len(pred_by_cls)):\n                if len(gt_by_cls[cls]) == 0:\n                    logging.info('no gt for class {}'.format(self.classes[cls]))\n                if len(pred_by_cls[cls]) == 0:\n                    logging.info('no prediction for class {}'.format(self.classes[cls]))\n                futures.append(p.submit(eval_ap, iou_range, cls, gt_by_cls[cls], pred_by_cls[cls]))\n            for f in concurrent.futures.as_completed(futures):\n                x = f.result()\n                ap_values[x[0], :] = x[1]\n\n        per_iou_ap = ap_values.mean(axis=0)\n        per_cls_ap = ap_values.mean(axis=1)\n        mAP = per_cls_ap.mean()\n       \n        self.stats[nms_mode]['mAP'] = mAP\n        self.stats[nms_mode]['ap_values'] = ap_values\n        self.stats[nms_mode]['per_iou_ap'] = per_iou_ap\n        self.stats[nms_mode]['per_cls_ap'] = per_cls_ap\n        return per_iou_ap\n\n    def dump_to_json(self, dets, save_path):\n        result_dict = {}\n        videos = dets['video-id'].unique()\n        for video in videos:\n            this_detections = dets[dets['video-id'] == video]\n            det_list = []\n            for idx, row in this_detections.iterrows():\n                det_list.append(\n                    {'segment': [float(row['t-start']), float(row['t-end'])], 'label': self.classes[int(row['cls'])], 'score': float(row['score'])}\n                )\n            \n            video_id = video[2:] if video.startswith('v_') else video\n            result_dict[video_id] = det_list\n\n        # the standard detection format for ActivityNet\n        output_dict={\n            \"version\": \"VERSION 1.3\",\n            \"results\": result_dict,\n            \"external_data\":{}}\n        if save_path:\n            dirname = osp.dirname(save_path)\n            if not osp.exists(dirname):\n                os.makedirs(dirname)\n            with open(save_path, 'w') as f:\n                
json.dump(output_dict, f)\n        # return output_dict\n\n    def dump_detection(self, save_path=None):\n        for nms_mode in self.nms_mode:\n            logging.info(\n                'dump detection result in JSON format to {}'.format(save_path.format(nms_mode)))\n            self.dump_to_json(self.all_pred[nms_mode], save_path.format(nms_mode))\n\n\ndef merge_distributed(all_pred):\n    '''gather outputs from different nodes at distributed mode'''\n    all_pred_gathered = all_gather(all_pred)\n    \n    merged_all_pred = {k: [] for k in all_pred}\n    for p in all_pred_gathered:\n        for k in p:\n            merged_all_pred[k] += p[k]\n\n    return merged_all_pred\n\n    \nif __name__ == '__main__':\n    pass\n\n\n"
  },
  {
    "path": "demo.py",
    "content": "# ------------------------------------------------------------------------\n# TadTR: End-to-end Temporal Action Detection with Transformer\n# Copyright (c) 2021. Xiaolong Liu.\n# ------------------------------------------------------------------------\n\n\nfrom models import build_model\nfrom opts import update_cfg_from_file\nfrom util.misc import NestedTensor\nimport torch\nimport time\nimport pdb\n\n\n# @torch.no_grad()\ndef demo(args, cfg):\n    device = torch.device(args.device)\n    model, _, _ = build_model(cfg)\n\n    bs, t = 1, 100\n    x = torch.rand([bs, cfg.feature_dim, t]).to(device)\n    mask = torch.ones([bs, t], dtype=torch.bool).to(device)\n    samples = NestedTensor(x, mask)\n    targets = [\n        {\n            'labels': torch.LongTensor([0, 0]).to(device),\n            'segments': torch.FloatTensor([[0.5, 0.2], [0.7, 0.3]]).to(device),\n            'orig_size': 100.0\n        } for i in range(bs)]\n\n    model.to(device)\n\n    outputs = model(samples)\n    \n    # orig_target_sizes = torch.FloatTensor(\n    #         [t[\"orig_size\"] for t in targets]).cuda()\n    # results = postprocessor(outputs, orig_target_sizes)\n    print('Passed')\n\n\nif __name__ == '__main__':\n    from opts import get_args_parser, cfg, update_cfg_with_args\n    args = get_args_parser().parse_args()\n\n    if args.cfg:\n        update_cfg_from_file(cfg, args.cfg)\n    update_cfg_with_args(cfg, args.opt)\n\n    if cfg.disable_cuda:\n        cfg.act_reg = False\n    demo(args, cfg)\n"
  },
  {
    "path": "docs/1_train_on_your_dataset.md",
    "content": "# Train and Evaluate TadTR on Your Dataset\n\nTODO"
  },
  {
    "path": "engine.py",
    "content": "# ------------------------------------------------------------------------\n# TadTR: End-to-end Temporal Action Detection with Transformer\n# ------------------------------------------------------------------------\n# Modified from DETR (https://github.com/facebookresearch/detr)\n# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n# ------------------------------------------------------------------------\n\n\"\"\"\nTrain and eval functions used in main.py\n\"\"\"\nimport math\nimport os.path as osp\nimport sys\nfrom typing import Iterable\nimport tqdm\nimport logging\n\nimport torch\n\nimport util.misc as utils\nfrom datasets.tad_eval import TADEvaluator\nimport pickle\n\ndef train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module,\n                    data_loader: Iterable, optimizer: torch.optim.Optimizer,\n                    device: torch.device, epoch: int, cfg, max_norm: float = 0):\n    model.train()\n    criterion.train()\n\n    metric_logger = utils.MetricLogger(delimiter=\"  \")\n    metric_logger.add_meter('lr', utils.SmoothedValue(\n        window_size=1, fmt='{value:.6f}'))\n    header = 'Epoch: [{}]'.format(epoch)\n    print_freq = 20\n    cnt = 0\n\n    for samples, targets in metric_logger.log_every(data_loader, print_freq, header):\n        samples = samples.to(device)\n        targets = [{k: v.to(device) if k in ['segments', 'labels']\n                    else v for k, v in t.items()} for t in targets]\n\n        outputs = model((samples.tensors, samples.mask))\n        loss_dict = criterion(outputs, targets)\n        weight_dict = criterion.weight_dict\n        losses = sum(loss_dict[k] * weight_dict[k]\n                     for k in loss_dict.keys() if k in weight_dict)\n        # reduce losses over all GPUs for logging purposes\n        loss_dict_reduced = utils.reduce_dict(loss_dict)\n        # loss of each type\n        loss_dict_reduced_unscaled = {f'{k}_unscaled': v\n                            
          for k, v in loss_dict_reduced.items()}\n        # weighted_loss of each type\n        loss_dict_reduced_scaled = {k: v * weight_dict[k]\n                                    for k, v in loss_dict_reduced.items() if k in weight_dict}\n        losses_reduced_scaled = sum(loss_dict_reduced_scaled.values())\n\n        loss_value = losses_reduced_scaled.item()\n\n        if not math.isfinite(loss_value):\n            logging.info(\"Loss is {}, stopping training\".format(loss_value))\n            logging.info(str(loss_dict_reduced))\n            sys.exit(1)\n\n        losses.backward()\n        if (cnt + 1) % cfg.iter_size == 0:\n            # scale gradients when iter size is functioning\n            if cfg.iter_size != 1:\n                for g in optimizer.param_groups:\n                    for p in g['params']:\n                        p.grad /= cfg.iter_size\n\n            if max_norm > 0:\n                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)\n            optimizer.step()\n            optimizer.zero_grad()\n\n        metric_logger.update(\n            loss=loss_value, **loss_dict_reduced_scaled, **loss_dict_reduced_unscaled)\n        metric_logger.update(lr=optimizer.param_groups[0][\"lr\"])\n\n        cnt += 1\n\n    optimizer.zero_grad()\n    # gather the stats from all processes\n    metric_logger.synchronize_between_processes()\n    logging.info(f\"Averaged stats:{metric_logger}\")\n    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}\n\n\ndef to_device(t, device):\n    if isinstance(t, (list, tuple)):\n        return t\n    else:\n        return t.to(device)\n\n\n@torch.no_grad()\ndef test(model, criterion, postprocessor, data_loader, base_ds, device, output_dir, cfg, subset='val', epoch=None, test_mode=False):\n    '''\n    Run inference and evaluation. 
Do not compute loss\n    test_mode: indicates that we are evaluating specific epoch during testing\n    '''\n    model.eval()\n    criterion.eval()\n\n    metric_logger = utils.MetricLogger(delimiter=\"  \")\n    metric_logger.add_meter('class_error', utils.SmoothedValue(\n        window_size=1, fmt='{value:.2f}'))\n\n    iou_range = [0.3, 0.4, 0.5, 0.6, 0.7] if cfg.dataset_name == 'thumos14' else [\n        num/100 for num in range(50, 100, 5)]\n    # logging.info('iou range {}'.format(iou_range))\n\n    # action_evaluator = None\n    action_evaluator = TADEvaluator(cfg.dataset_name, subset, base_ds, nms_mode=[\n                                          'raw'], iou_range=iou_range, epoch=epoch)\n\n    # raw_res = []\n    cnt = 0\n    for (samples, targets) in tqdm.tqdm(data_loader, total=len(data_loader)):\n        samples = samples.to(device)\n        outputs = model((samples.tensors, samples.mask))\n\n        # raw_res.append((outputs, targets))\n        video_duration = torch.FloatTensor(\n            [t[\"video_duration\"] for t in targets]).to(device)\n        results = postprocessor(outputs, video_duration, fuse_score=cfg.act_reg)\n\n        res = {target['video_id']: output for target,\n               output in zip(targets, results)}\n        if action_evaluator is not None:\n            action_evaluator.update(res, assign_cls_labels=cfg.binary)\n        # if cnt >= 9:\n        #     break\n        cnt += 1\n\n    # accumulate predictions from all videos\n    if action_evaluator is not None:\n        action_evaluator.synchronize_between_processes()\n        action_evaluator.accumulate(cfg.test_slice_overlap)\n        # dump detections\n        if test_mode:\n            save_path = osp.join('outputs', 'detection_{}.json')\n            action_evaluator.dump_detection(save_path)\n        action_evaluator.summarize()\n\n    stats = {}\n\n    if action_evaluator is not None:\n        for k, v in action_evaluator.stats.items():\n            for vk, vv in 
v.items():\n                stats[vk + '_' + k] = vv\n\n        mAP_values = ' '.join([f'{k}: {100*v:.2f}'.format(k, v)\n                              for k, v in stats.items() if k.startswith('mAP')])\n        logging.info(mAP_values)\n\n        stats['stats_summary'] = action_evaluator.stats_summary\n\n    # with open('raw_outputs.pkl', 'wb') as f:\n    #     pickle.dump(raw_res, f)\n\n    return stats\n"
  },
  {
    "path": "main.py",
    "content": "# ------------------------------------------------------------------------\n# TadTR: End-to-end Temporal Action Detection with Transformer\n# Copyright (c) 2021 - 2012. Xiaolong Liu\n# ------------------------------------------------------------------------\n# Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)\n# Copyright (c) 2020 SenseTime. All Rights Reserved.\n# Licensed under the Apache License, Version 2.0\n# ------------------------------------------------------------------------\n# and DETR (https://github.com/facebookresearch/detr)\n# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n# ------------------------------------------------------------------------\n'''Entry for training and testing'''\n\nimport datetime\nimport json\nimport random\nimport time\nfrom pathlib import Path\nimport re\nimport os\nimport logging\nimport sys\nimport os.path as osp\n\nimport numpy as np\nimport torch\nfrom torch.utils.data import DataLoader, DistributedSampler\n\nfrom opts import get_args_parser, cfg, update_cfg_with_args, update_cfg_from_file\nimport util.misc as utils\nfrom datasets import build_dataset\nfrom engine import train_one_epoch, test\nfrom models import build_model\nif cfg.tensorboard:\n    from torch.utils.tensorboard import SummaryWriter\n\n        \n\ndef main(args):\n    from util.logger import setup_logger\n\n    if args.cfg is not None:\n        update_cfg_from_file(cfg, args.cfg)\n\n    update_cfg_with_args(cfg, args.opt)\n\n    if cfg.output_dir:\n        Path(cfg.output_dir).mkdir(parents=True, exist_ok=True)\n\n    # The actionness regression module requires CUDA support\n    # If your machine does not have CUDA enabled, this module will be disabled.\n    if cfg.disable_cuda:\n        cfg.act_reg = False\n\n    utils.init_distributed_mode(args)\n\n    if not args.eval:\n        mode = 'train'\n    else:\n        mode = 'test'\n\n    # Logs will be saved in log_path\n    log_path 
= os.path.join(cfg.output_dir, mode + '.log')\n    setup_logger(log_path)\n\n    logging.info(\"git:\\n  {}\\n\".format(utils.get_sha()))\n\n    logging.info(' '.join(sys.argv))\n\n    with open(osp.join(cfg.output_dir, mode + '_cmd.txt'), 'w') as f:\n        f.write(' '.join(sys.argv) + '\\n')\n    logging.info(str(args))\n    logging.info(str(cfg))\n\n    device = torch.device(args.device)\n\n    # fix the seed\n    seed = args.seed + utils.get_rank()\n    torch.manual_seed(seed)\n    np.random.seed(seed)\n    random.seed(seed)\n\n    torch.cuda.manual_seed(seed)\n    torch.cuda.manual_seed_all(seed)\n\n    if cfg.input_type == 'image':\n        # We plan to support image input in the future\n        raise NotImplementedError\n\n    model, criterion, postprocessors = build_model(cfg)\n\n    model.to(device)\n    model_without_ddp = model\n\n    if args.distributed:\n        model = torch.nn.parallel.DistributedDataParallel(\n            model, device_ids=[args.gpu], find_unused_parameters=True)\n        model_without_ddp = model.module\n    elif args.multi_gpu:\n        model = torch.nn.DataParallel(model)\n        model_without_ddp = model.module\n\n    n_parameters = sum(p.numel() for p in model.parameters())\n    logging.info('number of params: {}'.format(n_parameters))\n\n    def match_name_keywords(n, name_keywords):\n        out = False\n        for b in name_keywords:\n            if b in n:\n                out = True\n                break\n        return out\n\n    param_dicts = [\n        # non-backbone, non-offset\n        {\n            \"params\":\n                [p for n, p in model_without_ddp.named_parameters()\n                 if not match_name_keywords(n, cfg.lr_backbone_names) and not match_name_keywords(n, cfg.lr_linear_proj_names) and p.requires_grad],\n            \"lr\": cfg.lr,\n            \"initial_lr\": cfg.lr\n        },\n        # backbone\n        {\n            \"params\": [p for n, p in model_without_ddp.named_parameters() if 
match_name_keywords(n, cfg.lr_backbone_names) and p.requires_grad],\n            \"lr\": cfg.lr_backbone,\n            \"initial_lr\": cfg.lr_backbone\n        },\n        # offset\n        {\n            \"params\": [p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, cfg.lr_linear_proj_names) and p.requires_grad],\n            \"lr\": cfg.lr * cfg.lr_linear_proj_mult,\n            \"initial_lr\": cfg.lr * cfg.lr_linear_proj_mult\n        }\n    ]\n\n    optimizer = torch.optim.__dict__[cfg.optimizer](param_dicts, lr=cfg.lr,\n                                                     weight_decay=cfg.weight_decay)\n\n    output_dir = Path(cfg.output_dir)\n\n    if args.resume == 'latest':\n        args.resume = osp.join(cfg.output_dir, 'checkpoint.pth')\n    elif args.resume == 'best':\n        args.resume = osp.join(cfg.output_dir, 'model_best.pth')\n\n    if 'model_best.pth' in os.listdir(cfg.output_dir) and not args.resume and not args.eval:\n        # for many times, my trained models were accidentally overwrittern by new models😂. So I add this to avoid that\n        logging.error(\n            'Danger! 
You are overwriting an existing output dir {}, probably because you forget to change the output_dir option'.format(cfg.output_dir))\n        confirm = input('confirm: y/n')\n        if confirm != 'y':\n            return\n\n    last_epoch = -1\n\n    if args.resume:\n        checkpoint = torch.load(args.resume, map_location='cpu')\n        last_epoch = checkpoint['epoch']\n\n    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(\n        optimizer, cfg.lr_step, last_epoch=last_epoch)\n\n    dataset_val = build_dataset(subset=cfg.test_set, args=cfg, mode='val')\n    if not args.eval:\n        dataset_train = build_dataset(subset='train', args=cfg, mode='train')\n\n    if args.distributed:\n        if not args.eval:\n            sampler_train = DistributedSampler(dataset_train)\n        sampler_val = DistributedSampler(dataset_val, shuffle=False)\n\n    else:\n        if not args.eval:\n            sampler_train = torch.utils.data.RandomSampler(dataset_train)\n        sampler_val = torch.utils.data.SequentialSampler(dataset_val)\n\n    if not args.eval:\n        batch_sampler_train = torch.utils.data.BatchSampler(\n            sampler_train, cfg.batch_size, drop_last=True)\n\n        data_loader_train = DataLoader(dataset_train,\n                                       batch_sampler=batch_sampler_train,\n                                       collate_fn=utils.collate_fn, num_workers=args.num_workers, pin_memory=True)\n\n    data_loader_val = DataLoader(dataset_val, cfg.batch_size, sampler=sampler_val,\n                                 drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers, pin_memory=True)\n\n    base_ds = dataset_val.video_dict\n\n    if not args.eval and cfg.tensorboard and utils.is_main_process():\n        smry_writer = SummaryWriter(output_dir)\n    else:\n        smry_writer = None\n\n    best_metric = -1\n    best_metric_txt = ''\n\n    if args.eval and not args.resume:\n        args.resume = osp.join(output_dir, 
'model_best.pth')\n\n    # start training from this epoch. You do not to set this option.\n    start_epoch = 0\n    if args.resume:\n        print('loading checkpint {}'.format(args.resume))\n        if args.resume.startswith('https'):\n            checkpoint = torch.hub.load_state_dict_from_url(\n                args.resume, map_location='cpu', check_hash=True)\n        else:\n            checkpoint = torch.load(args.resume, map_location='cpu')\n        \n        model_without_ddp.load_state_dict(checkpoint['model'], strict=False)\n\n        if 'epoch' in checkpoint:\n            start_epoch = checkpoint['epoch'] + 1\n\n        if 'best_metric' in checkpoint:\n            best_metric = checkpoint['best_metric']\n\n    if args.eval:\n        test_stats = test(model, criterion, postprocessors,\n                          data_loader_val, base_ds, device, cfg.output_dir, cfg, subset=cfg.test_set, epoch=checkpoint['epoch'], test_mode=True)\n\n        return\n\n    logging.info(\"Start training\")\n    start_time = time.time()\n\n    for epoch in range(start_epoch, cfg.epochs):\n        if args.distributed:\n            sampler_train.set_epoch(epoch)\n\n        for group in optimizer.param_groups:\n            logging.info('lr={}'.format(group['lr']))\n        train_stats = train_one_epoch(\n            model, criterion, data_loader_train, optimizer, device, epoch, cfg,\n            cfg.clip_max_norm)\n\n        lr_scheduler.step()\n\n        if cfg.output_dir:\n            # save checkpoint every `cfg.ckpt_interval` epochs, also when reducing the learning rate\n            checkpoint_paths = [output_dir / 'checkpoint.pth']\n            if (epoch + 1) in cfg.lr_step or (epoch + 1) % cfg.ckpt_interval == 0:\n                checkpoint_paths.append(\n                    output_dir / f'checkpoint{epoch:04}.pth')\n            ckpt = {\n                'model': model_without_ddp.state_dict(),\n                'epoch': epoch,\n                'args': args,\n                
'cfg': cfg,\n                'best_metric': best_metric,\n            }\n            for checkpoint_path in checkpoint_paths:\n                utils.save_on_master(ckpt, checkpoint_path)\n\n        if (epoch + 1) % cfg.test_interval == 0:\n            test_stats = test(\n                model, criterion, postprocessors, data_loader_val, base_ds, device, cfg.output_dir, cfg, epoch=epoch\n            )\n            prime_metric = 'mAP_raw'\n            if test_stats[prime_metric] > best_metric:\n                best_metric = test_stats[prime_metric]\n                best_metric_txt = test_stats['stats_summary']\n                logging.info(\n                    'new best metric {:.4f}@epoch{}'.format(best_metric, epoch))\n                if cfg.output_dir:\n                    ckpt['best_metric'] = best_metric\n                    best_ckpt_path = output_dir / 'model_best.pth'\n                    utils.save_on_master(ckpt, best_ckpt_path)\n\n        else:\n            test_stats = {}\n\n        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},\n                     **{f'test_{k}': v for k, v in test_stats.items()},\n                     'epoch': epoch,\n                     'n_parameters': n_parameters}\n\n        if cfg.output_dir and utils.is_main_process():\n            for k, v in log_stats.items():\n                if isinstance(v, np.ndarray):\n                    log_stats[k] = v.tolist()\n            with (output_dir / \"log.txt\").open(\"a\") as f:\n                f.write(json.dumps(log_stats) + \"\\n\")\n            if smry_writer:\n                for k, v in log_stats.items():\n                    if re.findall('loss_\\S+unscaled', k) or k.endswith('loss') or 'lr' in k or 'AP50' in k or 'AP75' in k or 'AP95' in k or 'mAP' in k or 'AR' in k:\n                        smry_writer.add_scalar(k, v, epoch)\n\n    total_time = time.time() - start_time\n    total_time_str = str(datetime.timedelta(seconds=int(total_time)))\n    if 
utils.is_main_process():\n        logging.info('Training time {}'.format(total_time_str))\n        logging.info(str(\n            ['{}:{}'.format(k, v) for k, v in test_stats.items() if 'AP' in k or 'AR' in k]))\n        if smry_writer is not None:\n            smry_writer.close()\n    logging.info('best det result\\n{}'.format(best_metric_txt))\n    logging.info(log_path)\n\n\nif __name__ == '__main__':\n    import argparse\n    parser = argparse.ArgumentParser(\n        'TadTR training and evaluation script', parents=[get_args_parser()])\n    args = parser.parse_args()\n\n    s_ = time.time()\n    main(args)\n    logging.info('main takes {:.3f} seconds'.format(time.time() - s_))\n"
  },
  {
    "path": "models/__init__.py",
    "content": "# ------------------------------------------------------------------------\n# TadTR: End-to-end Temporal Action Detection with Transformer\n# Copyright (c) 2021. Xiaolong Liu.\n# ------------------------------------------------------------------------\n\n'''build models'''\n\nfrom .tadtr import build\n\ndef build_model(args):\n    return build(args)\n"
  },
  {
    "path": "models/custom_loss.py",
    "content": "# Mostly copied from DETR (https://github.com/facebookresearch/detr)\n# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n\n'''Focal loss implementation'''\n\n\nimport torch\nimport torch.nn.functional as F\n\n\ndef sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):\n    \"\"\"\n    Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.\n    Args:\n        inputs: A float tensor of arbitrary shape.\n                The predictions for each example.\n        targets: A float tensor with the same shape as inputs. Stores the binary\n                 classification label for each element in inputs\n                (0 for the negative class and 1 for the positive class).\n        alpha: (optional) Weighting factor in range (0,1) to balance\n                positive vs negative examples. Default = -1 (no weighting).\n        gamma: Exponent of the modulating factor (1 - p_t) to\n               balance easy vs hard examples.\n    Returns:\n        Loss tensor\n    \"\"\"\n    prob = inputs.sigmoid()\n    ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction=\"none\")\n    p_t = prob * targets + (1 - prob) * (1 - targets)\n    loss = ce_loss * ((1 - p_t) ** gamma)\n\n    if alpha >= 0:\n        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)\n        loss = alpha_t * loss\n\n    return loss.mean(1).sum() / num_boxes\n\n\nif __name__ == \"__main__\":\n    import numpy as np\n    pred = torch.from_numpy(np.random.random([8, 2]))\n    target = torch.from_numpy(np.random.random(8) > 0.5).long()\n    loss = sigmoid_focal_loss(pred, target)\n    \n\n"
  },
  {
    "path": "models/matcher.py",
    "content": "# ------------------------------------------------------------------------\n# TadTR: End-to-end Temporal Action Detection with Transformer\n# ------------------------------------------------------------------------\n# Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)\n# Copyright (c) 2020 SenseTime. All Rights Reserved.\n# ------------------------------------------------------------------------\n# Modified from DETR (https://github.com/facebookresearch/detr)\n# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.\n# ------------------------------------------------------------------------\n\n# Modified from DETR (https://github.com/facebookresearch/detr)\n# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.\n\nimport torch\nfrom scipy.optimize import linear_sum_assignment\nfrom torch import nn\n\nfrom util.segment_ops import segment_cw_to_t1t2, segment_iou\nimport pdb\n\n\nclass HungarianMatcher(nn.Module):\n    \"\"\"This class computes an assignment between the targets and the predictions of the network\n    For efficiency reasons, the targets don't include the no_object. Because of this, in general,\n    there are more predictions than targets. 
In this case, we do a 1-to-1 matching of the best predictions,\n    while the others are un-matched (and thus treated as non-objects).\n    \"\"\"\n\n    def __init__(self, cost_class: float = 1, cost_seg: float = 1, cost_iou: float = 1):\n        \"\"\"Creates the matcher\n        Params:\n            cost_class: This is the relative weight of the classification error in the matching cost\n            cost_seg: This is the relative weight of the L1 error of the segment coordinates in the matching cost\n            cost_iou: This is the relative weight of the iou loss of the segment in the matching cost\n        \"\"\"\n        super().__init__()\n        self.cost_class = cost_class\n        self.cost_seg = cost_seg\n        self.cost_iou = cost_iou\n        assert cost_class != 0 or cost_seg!= 0 or cost_iou != 0, \"all costs cant be 0\"\n\n    @torch.no_grad()\n    def forward(self, outputs, targets):\n        \"\"\" Performs the matching\n        Params:\n            outputs: This is a dict that contains at least these entries:\n                 \"pred_logits\": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits\n                 \"pred_segments\": Tensor of dim [batch_size, num_queries, 2] with the predicted segment coordinates\n\n            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:\n                 \"labels\": Tensor of dim [num_target_segments] (where num_target_segments is the number of ground-truth\n                           objects in the target) containing the class labels\n                 \"segments\": Tensor of dim [num_target_segments, 2] containing the target segment coordinates\n\n        Returns:\n            A list of size batch_size, containing tuples of (index_i, index_j) where:\n                - index_i is the indices of the selected predictions (in order)\n                - index_j is the indices of the corresponding selected targets (in order)\n 
           For each batch element, it holds:\n                len(index_i) = len(index_j) = min(num_queries, num_target_segments)\n        \"\"\"\n        bs, num_queries = outputs[\"pred_logits\"].shape[:2]\n\n        # We flatten to compute the cost matrices in a batch\n        out_prob = outputs[\"pred_logits\"].flatten(0, 1).sigmoid()  #  [batch_size * num_queries, num_classes]\n        out_seg = outputs[\"pred_segments\"].flatten(0, 1)  # [batch_size * num_queries, 2]\n\n        # Also concat the target labels and segments\n        tgt_ids = torch.cat([v[\"labels\"] for v in targets])  # shape = n1+n2+...\n        tgt_seg = torch.cat([v[\"segments\"] for v in targets])\n\n        # Compute the classification cost.\n        alpha = 0.25\n        gamma = 2.0\n        neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log())\n        pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())\n        cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids]\n        \n        # Compute the L1 cost between segments\n        cost_seg = torch.cdist(out_seg, tgt_seg, p=1)\n\n        # Compute the iou cost betwen segments\n        cost_iou = -segment_iou(segment_cw_to_t1t2(out_seg), segment_cw_to_t1t2(tgt_seg))\n\n        # Final cost matrix, [bs x nq, batch_ngt]\n        C = self.cost_seg * cost_seg + self.cost_class * cost_class + self.cost_iou * cost_iou\n        C = C.view(bs, num_queries, -1).cpu()\n\n        sizes = [len(v[\"segments\"]) for v in targets]\n        indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]\n        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]\n\n\ndef build_matcher(args):\n    return HungarianMatcher(cost_class=args.set_cost_class, cost_seg=args.set_cost_seg, cost_iou=args.set_cost_iou)"
  },
  {
    "path": "models/ops/roi_align/__init__.py",
    "content": "from .roi_align import ROIAlign\n\n# __all__ = ['roi_pool', 'ROIAlign']"
  },
  {
    "path": "models/ops/roi_align/roi_align.py",
    "content": "# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.\nimport torch\nfrom torch import nn\nfrom torch.autograd import Function\nfrom torch.autograd.function import once_differentiable\nfrom torch.nn.modules.utils import _pair\n\nfrom . import Align1D as _align_1d\n\nclass _Align1D(Function):\n    @staticmethod\n    def forward(ctx, input, roi, feature_dim, ratio):\n        ctx.save_for_backward(roi)\n        ctx.feature_dim = feature_dim\n        ctx.input_shape = input.size()\n        ctx.sampling_ratio = ratio\n        output = _align_1d.forward(\n            input, roi, feature_dim, ratio\n        )\n        return output\n\n    @staticmethod\n    @once_differentiable\n    def backward(ctx, grad_output):\n        rois, = ctx.saved_tensors\n        feature_dim = ctx.feature_dim\n        bs, ch, t = ctx.input_shape\n        ratio = ctx.sampling_ratio\n        grad_input = _align_1d.backward(\n            grad_output,\n            rois,\n            feature_dim,\n            bs,\n            ch,\n            t,\n            ratio\n        )\n        return grad_input, None, None, None, None\n\n\nalign1d = _Align1D.apply\n\n\nclass ROIAlign(nn.Module):\n    def __init__(self, feature_dim, ratio=0):\n        super(ROIAlign, self).__init__()\n        self.feature_dim = feature_dim\n        self.ratio = ratio\n\n    def forward(self, input, rois):\n        # print('- input shape is', input.shape)\n        # print('- input mean is', input.mean())\n        # print('- rois shape is', rois.shape)\n        # print('- rois is on', rois.get_device())\n        assert input.device==rois.device, 'Align operation requires ' + \\\n\t\t\t'both feature and roi are on the same device! 
' + \\\n            'Get feature on {} but roi on {}'.format(input.device,rois.device)\n\n        out = align1d(input, rois, self.feature_dim, self.ratio)\n        # print('- output shape is', out.shape)\n        # print('- output mean is', out.mean())\n        return out\n\n    def __repr__(self):\n        tmpstr = self.__class__.__name__ + \"(\"\n        tmpstr += \"feature_dim=\" + str(self.feature_dim)\n        tmpstr += \"sampling_ratio=\" + str(self.ratio)\n        tmpstr += \")\"\n        return tmpstr\n\nif __name__ == \"__main__\":\n    layer = Align1DLayer(16)\n    # layer = torch.nn.DataParallel(layer, device_ids=[0,1])\n    input = torch.tensor([[[1.,2,3,4,5,6,7,8,9,10],[11,12,13,14,15,16,17,18,19,20]]]).cuda()\n    proposal = torch.tensor([[0,-0.5,9.5],[0,0.1,0.9]]).cuda()\n    print(input.shape, proposal.shape)\n    output = layer(input, proposal)\n    print(\"output has shape {}, with mean {}\".format(output.shape, torch.mean(output)))\n    print(output)"
  },
  {
    "path": "models/ops/roi_align/src/roi_align_cuda.cpp",
    "content": "#include <torch/extension.h>\n\n#include <vector>\n\n// CUDA forward declarations\nat::Tensor Align_forward_cuda(const at::Tensor& input,\n                                 const at::Tensor& rois,\n                                 const float spatial_scale,\n                                 const int pooled_height,\n                                 const int sampling_ratio);\n\nat::Tensor Align_backward_cuda(const at::Tensor& grad,\n                                  const at::Tensor& rois,\n                                  const float spatial_scale,\n                                  const int pooled_height,\n                                  const int batch_size,\n                                  const int channels,\n                                  const int height,\n                                  const int sampling_ratio);\n\n// C++ interface\nat::Tensor Align_forward(const at::Tensor& input, // (bs,ch,t)\n                                 const at::Tensor& rois, // (bs, start, end)\n                                 const int pooled_height,\n                                 const int sampling_ratio){\n    return Align_forward_cuda( input, rois, 1.0, pooled_height, sampling_ratio);\n                                     }\n\nat::Tensor Align_backward(const at::Tensor& grad,\n                                  const at::Tensor& rois,\n                                  const int pooled_height,\n                                  const int batch_size,\n                                  const int channels,\n                                  const int height,\n                                  const int sampling_ratio){\n    return Align_backward_cuda(grad, rois, 1.0, pooled_height, batch_size, channels, height, sampling_ratio);\n                                      }\n\nPYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {\n  m.def(\"forward\", &Align_forward, \"Align forward (CUDA)\");\n  m.def(\"backward\", &Align_backward, \"Align backward (CUDA)\");\n}"
  },
  {
    "path": "models/ops/roi_align/src/roi_align_kernel.cu",
    "content": "// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.\n// Modifies by Frost for 1D ussage\n#include <ATen/ATen.h>\n#include <ATen/cuda/CUDAContext.h>\n\n#include <THC/THC.h>\n#include <THC/THCAtomics.cuh>\n#include <THC/THCDeviceUtils.cuh>\n\n// TODO make it in a common file\n#define CUDA_1D_KERNEL_LOOP(i, n)                            \\\n  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \\\n       i += blockDim.x * gridDim.x)\n\n\ntemplate <typename T>\n__device__ T linear_interpolate(const T* bottom_data,\n    const int height,\n    T t,\n    const int index /* index for debug only*/) {\n\n  // deal with cases that inverse elements are out of feature map boundary\n  if (t < -1.0 || t > height) {\n    //empty\n    return 0;\n  }\n\n  if (t <= 0) t = 0;\n\n  int t_low = (int) t;\n  int t_high;\n\n  // get closest integers to t\n  if (t_low >= height - 1) {\n    t_high = t_low = height - 1;\n    t = (T) t_low;\n  } else {\n    t_high = t_low + 1;\n  }\n\n  // get the distance to t\n  T lt = t - t_low;\n  T ht = 1. 
- lt;\n\n  // do linear interpolation\n  T v1 = bottom_data[t_low];\n  T v2 = bottom_data[t_high];\n  T w1 = ht, w2 = lt;\n\n  T val = (w1 * v1 + w2 * v2);\n  // printf(\"Check Linear Interpolate: w1=%f, v1=%f, w2=%f, v2=%f \\n\", w1, v1, w2, v2);\n  return val;\n}\n\ntemplate <typename T>\n__global__ void Align1DForward(const int nthreads, const T* bottom_data,\n    const T spatial_scale, const int channels,\n    const int height,\n    const int pooled_height, \n    const int sampling_ratio,\n    const T* bottom_rois, T* top_data) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, pt) is an element in the pooled output\n    int pt = index % pooled_height;\n    int c = (index / pooled_height) % channels;\n    int n = index / pooled_height / channels;\n\n    // printf(\"Debug Main Loop: get pt, c, n are %d, %d, %d \\n\", pt, c, n);\n\n    const T* offset_bottom_rois = bottom_rois + n * 3;\n    int roi_batch_ind = offset_bottom_rois[0];\n\n    // Do not using rounding; this implementation detail is critical\n    T roi_start = offset_bottom_rois[1] * spatial_scale;\n    T roi_end = offset_bottom_rois[2] * spatial_scale;\n    // printf(\"Debug roi boundary: w1,  w2,  is  %f, %f \\n\", roi_start,roi_end,);\n\n    // Force malformed ROIs to be 1x1\n    T roi_height = max(roi_end- roi_start, (T)1.);\n    T bin_size = static_cast<T>(roi_height) / static_cast<T>(pooled_height);\n\n    const T* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height;\n\n    // We use roi_bin_grid to sample the grid and mimic integral\n    int roi_bin_grid = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2\n\n    // We do average (integral) pooling inside a bin\n    const T count = roi_bin_grid; // e.g. 
= 4\n\n    T output_val = 0.;\n    for (int it = 0; it < roi_bin_grid; it ++) // e.g., it = 0, 1\n    {\n      const T t = roi_start + pt * bin_size + static_cast<T>(it + .5f) * bin_size / static_cast<T>(roi_bin_grid); // e.g., 0.5, 1.5\n\n      T val = linear_interpolate(offset_bottom_data, height, t, index);\n      // printf(\"Debug linear_interpolate: input=height:%d, t:%f, ... ; output=val:%f \\n\", height, t, val);\n      output_val += val;\n    }\n    output_val /= count;\n\n    top_data[index] = output_val;\n  }\n}\n\n\ntemplate <typename T>\n__device__ void linear_interpolate_gradient(\n    const int height, \n    T t,\n    T & w1, T & w2,\n    int & t_low, int & t_high, \n    const int index /* index for debug only*/) {\n\n  // deal with cases that inverse elements are out of feature map boundary\n  if (t < -1.0 || t > height) {\n    //empty\n    w1 = w2 = 0.;\n    t_low = t_high = -1;\n    return;\n  }\n\n  if (t <= 0) t = 0;\n\n  t_low = (int) t;\n\n  if (t_low >= height - 1) {\n    t_high = t_low = height - 1;\n    t = (T) t_low;\n  } else {\n    t_high = t_low + 1;\n  }\n\n  T lt = t - t_low;\n  T ht = 1. 
- lt;\n\n  // T val = (w1 * v1 + w2 * v2);\n  // T w1 = ht, w2 = lt;\n  w1 = ht , w2 = lt;\n\n  return;\n}\n\ntemplate <typename T>\n__global__ void Align1DBackwardFeature(const int nthreads, const T* top_diff,\n    const int num_rois, const T spatial_scale,\n    const int channels, const int height,\n    const int pooled_height,\n    const int sampling_ratio,\n    T* bottom_diff,\n    const T* bottom_rois) {\n  CUDA_1D_KERNEL_LOOP(index, nthreads) {\n    // (n, c, pt) is an element in the pooled output\n    int pt = (index ) % pooled_height;\n    int c = (index / pooled_height) % channels;\n    int n = index / pooled_height / channels;\n\n    const T* offset_bottom_rois = bottom_rois + n * 3;\n    int roi_batch_ind = offset_bottom_rois[0];\n\n    // Do not using rounding; this implementation detail is critical\n    T roi_start= offset_bottom_rois[1] * spatial_scale;\n    T roi_end= offset_bottom_rois[2] * spatial_scale;\n\n    // Force malformed ROIs to be 1x1\n    T roi_height = max(roi_end- roi_start, (T)1.);\n    T bin_size = static_cast<T>(roi_height) / static_cast<T>(pooled_height);\n\n    T* offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height;\n\n    int top_offset    = (n * channels + c) * pooled_height;\n    const T* offset_top_diff = top_diff + top_offset;\n    const T top_diff_this_bin = offset_top_diff[pt];\n\n    // We use roi_bin_grid to sample the grid and mimic integral\n    int roi_bin_grid= (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2\n\n    // We do average (integral) pooling inside a bin\n    const T count = roi_bin_grid; // e.g. 
= 4\n\n    for (int it = 0; it < roi_bin_grid; it ++) // e.g., iy = 0, 1\n    {\n      const T t = roi_start+ pt * bin_size+ static_cast<T>(it + .5f) * bin_size/ static_cast<T>(roi_bin_grid); // e.g., 0.5, 1.5\n\n      T w1, w2;\n      int t_low, t_high;\n\n      linear_interpolate_gradient(height, t, w1, w2, t_low, t_high, index);\n\n      T g1 = top_diff_this_bin * w1 / count;\n      T g2 = top_diff_this_bin * w2 / count;\n\n      if (t_low >= 0 && t_high >= 0)\n      {\n          atomicAdd(offset_bottom_diff + t_low, static_cast<T>(g1));\n          atomicAdd(offset_bottom_diff + t_high, static_cast<T>(g2));\n      } // if\n    } // it\n  } // CUDA_1D_KERNEL_LOOP\n} // RoIAlignBackward\n\n\nat::Tensor Align_forward_cuda(const at::Tensor& input,\n                                 const at::Tensor& rois,\n                                 const float spatial_scale,\n                                 const int pooled_height,\n                                 const int sampling_ratio) {\n  AT_ASSERTM(input.type().is_cuda(), \"input must be a CUDA tensor\");\n  AT_ASSERTM(rois.type().is_cuda(), \"rois must be a CUDA tensor\");\n  auto num_rois = rois.size(0);\n  auto channels = input.size(1);\n  auto height = input.size(2);\n\n  auto output = at::empty({num_rois, channels, pooled_height}, input.options());\n  auto output_size = num_rois * pooled_height * channels;\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  dim3 grid(std::min(THCCeilDiv((long)output_size, 512L), 4096L));\n  dim3 block(512);\n\n  // printf(\"Debug main function: height:%d\\n\", height);\n\n  if (output.numel() == 0) {\n    THCudaCheck(cudaGetLastError());\n    return output;\n  }\n\n  AT_DISPATCH_FLOATING_TYPES(input.type(), \"Align1D_forward\", [&] {\n    Align1DForward<scalar_t><<<grid, block, 0, stream>>>(\n         output_size,\n         input.contiguous().data<scalar_t>(),\n         spatial_scale,\n         channels,\n         height,\n         pooled_height,\n         
sampling_ratio,\n         rois.contiguous().data<scalar_t>(),\n         output.data<scalar_t>());\n  });\n  THCudaCheck(cudaGetLastError());\n  return output;\n}\n\n// TODO remove the dependency on input and use instead its sizes -> save memory\nat::Tensor Align_backward_cuda(const at::Tensor& grad,\n                                  const at::Tensor& rois,\n                                  const float spatial_scale,\n                                  const int pooled_height,\n                                  const int batch_size,\n                                  const int channels,\n                                  const int height,\n                                  const int sampling_ratio) {\n  AT_ASSERTM(grad.type().is_cuda(), \"grad must be a CUDA tensor\");\n  AT_ASSERTM(rois.type().is_cuda(), \"rois must be a CUDA tensor\");\n\n  auto num_rois = rois.size(0);\n  auto grad_input = at::zeros({batch_size, channels, height}, grad.options());\n\n  cudaStream_t stream = at::cuda::getCurrentCUDAStream();\n\n  dim3 grid(std::min(THCCeilDiv((long)grad.numel(), 512L), 4096L));\n  dim3 block(512);\n\n  // handle possibly empty gradients\n  if (grad.numel() == 0) {\n    THCudaCheck(cudaGetLastError());\n    return grad_input;\n  }\n\n  AT_DISPATCH_FLOATING_TYPES(grad.type(), \"ROIAlign_backward\", [&] {\n    Align1DBackwardFeature<scalar_t><<<grid, block, 0, stream>>>(\n         grad.numel(),\n         grad.contiguous().data<scalar_t>(),\n         num_rois,\n         spatial_scale,\n         channels,\n         height,\n         pooled_height,\n         sampling_ratio,\n         grad_input.data<scalar_t>(),\n         rois.contiguous().data<scalar_t>());\n  });\n  THCudaCheck(cudaGetLastError());\n  return grad_input;\n}"
  },
  {
    "path": "models/ops/setup.py",
    "content": "# ------------------------------------------------------------------------\n# TadTR: End-to-end Temporal Action Detection with Transformer\n# Copyright (c) 2021. Xiaolong Liu.\n# ------------------------------------------------------------------------\n# Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)\n# Copyright (c) 2020 SenseTime. All Rights Reserved.\n# Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n# ------------------------------------------------------------------------------------------------\n# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0\n# ------------------------------------------------------------------------------------------------\n\nimport os\nimport glob\nimport pdb\n\nimport torch\n\nfrom torch.utils.cpp_extension import CUDA_HOME\nfrom torch.utils.cpp_extension import CppExtension\nfrom torch.utils.cpp_extension import CUDAExtension\n\nfrom setuptools import find_packages\nfrom setuptools import setup\n\nrequirements = [\"torch\", \"torchvision\"]\n\n\ndef get_sources(extensions_dir):\n    main_file = glob.glob(os.path.join(extensions_dir, \"*.cpp\"))\n    source_cpu = glob.glob(os.path.join(extensions_dir, \"cpu\", \"*.cpp\"))\n    source_cuda = glob.glob(os.path.join(extensions_dir, \"cuda\", \"*.cu\"))\n    return main_file + source_cpu + source_cuda\n\n\ndef get_extensions():\n    this_dir = os.path.dirname(os.path.abspath(__file__))\n   \n    extra_compile_args = {\"cxx\": []}\n    define_macros = []\n\n    if torch.cuda.is_available() and CUDA_HOME is not None:\n        define_macros += [(\"WITH_CUDA\", None)]\n        extra_compile_args[\"nvcc\"] = [\n            \"-DCUDA_HAS_FP16=1\",\n            \"-D__CUDA_NO_HALF_OPERATORS__\",\n            \"-D__CUDA_NO_HALF_CONVERSIONS__\",\n            \"-D__CUDA_NO_HALF2_OPERATORS__\",\n        ]\n    else:\n        raise NotImplementedError('Cuda is not 
availabel')\n\n\n    ext_modules = [\n        # Temporal Deformable Attention, optional\n        # CUDAExtension(\n        #     \"temporal_deform_attn.TemporalDeformableAttention\",\n        #     get_sources(os.path.join(this_dir, \"temporal_deform_attn/src\")),\n        #     include_dirs=[os.path.join(this_dir, \"temporal_deform_attn/src\")],\n        #     define_macros=define_macros,\n        #     extra_compile_args=extra_compile_args\n        # ),\n\n        CUDAExtension('roi_align.Align1D', [\n            'roi_align/src/roi_align_cuda.cpp',\n            'roi_align/src/roi_align_kernel.cu'])\n    ]\n    return ext_modules\n\nsetup(\n    name=\"TadTR_release\",\n    version=\"1.0\",\n    author=\"Xiaolong Liu\",\n    description=\"PyTorch Wrapper for CUDA Functions of TadTR\",\n    packages=find_packages(exclude=(\"configs\", \"tests\",)),\n    ext_modules=get_extensions(),\n    cmdclass={\"build_ext\": torch.utils.cpp_extension.BuildExtension},\n)\n"
  },
  {
    "path": "models/ops/temporal_deform_attn/__init__.py",
    "content": "# ------------------------------------------------------------------------\n# TadTR: End-to-end Temporal Action Detection with Transformer\n# Copyright (c) 2021. Xiaolong Liu.\n# ------------------------------------------------------------------------------------------------\n# Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)\n# Copyright (c) 2020 SenseTime. All Rights Reserved.\n# Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n# ------------------------------------------------------------------------------------------------\n# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0\n# ------------------------------------------------------------------------------------------------\n\nfrom .temporal_deform_attn import DeformAttn\n"
  },
  {
    "path": "models/ops/temporal_deform_attn/temporal_deform_attn.py",
    "content": "# ------------------------------------------------------------------------\n# TadTR: End-to-end Temporal Action Detection with Transformer\n# Copyright (c) 2021. Xiaolong Liu.\n# ------------------------------------------------------------------------------------------------\n# Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)\n# Copyright (c) 2020 SenseTime. All Rights Reserved.\n# Licensed under the Apache License, Version 2.0 [see LICENSE for details]\n# ------------------------------------------------------------------------------------------------\n# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0\n# ------------------------------------------------------------------------------------------------\n\nfrom __future__ import absolute_import\nfrom __future__ import print_function\nfrom __future__ import division\n\nfrom opts import cfg\n\n# if not cfg.disable_cuda:\n#     from .functions import TDAFunction\n\nimport warnings\nimport math\nimport pdb\n\nimport torch\nfrom torch import nn\nimport torch.nn.functional as F\nfrom torch.nn.init import xavier_uniform_, constant_\n\n\n\ndef _is_power_of_2(n):\n    if (not isinstance(n, int)) or (n < 0):\n        raise ValueError(\n            \"invalid input for _is_power_of_2: {} (type: {})\".format(n, type(n)))\n    return (n & (n-1) == 0) and n != 0\n\n\nclass DeformAttn(nn.Module):\n    def __init__(self, d_model=256, n_levels=1, n_heads=8, n_points=4):\n        \"\"\"\n        Deformable Attention Module\n        :param d_model      hidden dimension\n        :param n_levels     number of feature levels\n        :param n_heads      number of attention heads\n        :param n_points     number of sampling points per attention head\n        \"\"\"\n        super().__init__()\n        if d_model % n_heads != 0:\n            raise ValueError(\n                'd_model must be divisible by n_heads, but got {} and 
{}'.format(d_model, n_heads))\n        _d_per_head = d_model // n_heads\n        # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation\n        if not _is_power_of_2(_d_per_head):\n            warnings.warn(\"You'd better set d_model in DeformAttn to make the dimension of each attention head a power of 2 \"\n                          \"which is more efficient in our CUDA implementation.\")\n\n        assert n_levels == 1, 'multi-level attention is not supported!'\n\n        self.seq2col_step = 64\n\n        self.d_model = d_model\n        self.n_levels = n_levels\n        self.n_heads = n_heads\n        self.n_points = n_points\n\n        self.sampling_offsets = nn.Linear(\n            d_model, n_heads * n_levels * n_points)\n        self.attention_weights = nn.Linear(\n            d_model, n_heads * n_levels * n_points)\n        self.value_proj = nn.Linear(d_model, d_model)\n        self.output_proj = nn.Linear(d_model, d_model)\n\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        constant_(self.sampling_offsets.weight.data, 0.)\n        # Initial offsets:\n        # (1, 0, -1, 0, -1, 0, 1, 0)\n        thetas = torch.arange(\n            self.n_heads, dtype=torch.float32) * (4.0 * math.pi / self.n_heads)\n        grid_init = thetas.cos()[:, None]\n\n        grid_init = grid_init.view(self.n_heads, 1, 1, 1).repeat(\n            1, self.n_levels, self.n_points, 1)\n        for i in range(self.n_points):\n            grid_init[:, :, i, :] *= i + 1\n\n        with torch.no_grad():\n            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))\n        constant_(self.attention_weights.weight.data, 0.)\n        constant_(self.attention_weights.bias.data, 0.)\n        xavier_uniform_(self.value_proj.weight.data)\n        constant_(self.value_proj.bias.data, 0.)\n        xavier_uniform_(self.output_proj.weight.data)\n        constant_(self.output_proj.bias.data, 0.)\n\n    def 
forward(self, query, reference_points, input_flatten, input_temporal_lens, input_level_start_index, input_padding_mask=None):\n        \"\"\"\n        :param query (= src + pos)         (N, Length_{query}, C)\n        :param reference_points            (N, Length_{query}, n_levels, 1), range in [0, 1], left (0), right (1), including padding area\n                                        or (N, Length_{query}, n_levels, 2), add additional (t) to form reference segments\n        :param input_flatten (=src)        (N, \\sum_{l=0}^{L-1} T_l, C)\n        :param input_temporal_lens         (n_levels), [T_0, T_1, ..., T_(L-1)]\n        :param input_level_start_index     (n_levels, ), [0, T_0, T_1, T_2, ..., T_{L-1}]\n        :param input_padding_mask          (N, \\sum_{l=0}^{L-1} T_l), True for padding elements, False for non-padding elements\n\n        :return output                     (N, Length_{query}, C)\n        \"\"\"\n        N, Len_q, _ = query.shape\n        N, Len_in, _ = input_flatten.shape\n        assert input_temporal_lens.sum() == Len_in\n\n        value = self.value_proj(input_flatten)\n        if input_padding_mask is not None:\n            value = value.masked_fill(input_padding_mask[..., None], float(0))\n        value = value.view(N, Len_in, self.n_heads,\n                           self.d_model // self.n_heads)\n        # the predicted offset in temporal axis. 
They are *absolute* values, not normalized\n        sampling_offsets = self.sampling_offsets(query).view(\n            N, Len_q, self.n_heads, self.n_levels, self.n_points, 1)\n        attention_weights = self.attention_weights(query).view(\n            N, Len_q, self.n_heads, self.n_levels * self.n_points)\n        attention_weights = F.softmax(\n            attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)\n\n        if reference_points.shape[-1] == 1:\n            # the reference points are normalized, but the offset are unnormalized\n            # so we need to normalize the offsets\n            offset_normalizer = input_temporal_lens[..., None]\n            # (N, Length_{query}, n_heads, n_levels, n_points, 1)\n            sampling_locations = reference_points[:, :, None, :, None, :] \\\n                + sampling_offsets / \\\n                offset_normalizer[None, None, None, :, None, :]\n        # deform attention in the l-th (l >= 2) decoder layer when segment refinement is enabled\n        elif reference_points.shape[-1] == 2:\n            # offsets are related with the size of the reference segment\n            sampling_locations = reference_points[:, :, None, :, None, :1] \\\n                + sampling_offsets / self.n_points * \\\n                reference_points[:, :, None, :, None, 1:] * 0.5\n\n        else:\n            raise ValueError(\n                'Last dim of reference_points must be 1 or 2, but get {} instead.'.format(reference_points.shape[-1]))\n        if cfg.dfm_att_backend == 'pytorch' or cfg.disable_cuda:\n            # Implementation with PyTorch grid_sample operator. \n            # Note that grid_sample only supports image inputs. 
We need to view the sequence as an image with height=1\n            sampling_locations = torch.cat((sampling_locations, torch.ones_like(sampling_locations)*0.5), dim=-1)\n            input_spatial_shapes = torch.stack((torch.ones_like(input_temporal_lens), input_temporal_lens), dim=-1)\n            output = deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights)\n        else:\n            raise NotImplementedError\n            # # CUDA implementation. You will get identical results with the pytorch implementation\n            # output = TDAFunction.apply(\n            #     value, input_temporal_lens, input_level_start_index, sampling_locations, attention_weights, self.seq2col_step)\n        output = self.output_proj(output)\n        return output, (sampling_locations, attention_weights)\n\n\ndef deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):\n    '''deformable attention implemeted with grid_sample.'''\n    N_, S_, M_, D_ = value.shape\n    _, Lq_, M_, L_, P_, _ = sampling_locations.shape\n    value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)\n    sampling_grids = 2 * sampling_locations - 1\n    sampling_value_list = []\n    for lid_, (H_, W_) in enumerate(value_spatial_shapes):\n        # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_\n        value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_)\n        # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2\n        sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)\n        # N_*M_, D_, Lq_, P_\n        sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_,\n                                          mode='bilinear', padding_mode='zeros', align_corners=False)\n        sampling_value_list.append(sampling_value_l_)\n    # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_)\n    
attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)\n    output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)\n    return output.transpose(1, 2).contiguous()"
  },
  {
    "path": "models/position_encoding.py",
    "content": "# ------------------------------------------------------------------------\n# TadTR: End-to-end Temporal Action Detection with Transformer\n# Copyright (c) 2021. Xiaolong Liu.#\n#  ------------------------------------------------------------------------\n# Modified from DETR (https://github.com/facebookresearch/detr)\n# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n# ------------------------------------------------------------------------\n\n\n\"\"\"\nPositional encodings for the transformer.\n\"\"\"\nimport math\nimport torch\nfrom torch import nn\n\nfrom util.misc import NestedTensor\n\n\nclass PositionEmbeddingSine(nn.Module):\n    \"\"\"\n    This is a more standard version of the position embedding, very similar to the one\n    used by the Attention is all you need paper, generalized to work on videos.\n    \"\"\"\n    def __init__(self, num_pos_feats=256, temperature=10000, normalize=False, scale=None):\n        super().__init__()\n        self.num_pos_feats = num_pos_feats\n        self.temperature = temperature\n        self.normalize = normalize\n        if scale is not None and normalize is False:\n            raise ValueError(\"normalize should be True if scale is passed\")\n        if scale is None:\n            scale = 2 * math.pi\n        self.scale = scale\n\n    def forward(self, tensor_list: NestedTensor):\n        x = tensor_list.tensors\n        mask = tensor_list.mask\n        assert mask is not None\n        not_mask = ~mask\n        x_embed = not_mask.cumsum(1, dtype=torch.float32)  # N x T\n        if self.normalize:\n            eps = 1e-6\n            x_embed = x_embed / (x_embed[:, -1:] + eps) * self.scale\n\n        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)\n        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)\n\n        pos_x = x_embed[:, :, None] / dim_t  # N x T x C\n        # n,c,t\n        pos_x = torch.stack((pos_x[:, :, 0::2].sin(), 
pos_x[:, :, 1::2].cos()), dim=3).flatten(2)\n        pos = pos_x.permute(0, 2, 1)    # N x C x T\n        return pos\n\n\ndef build_position_encoding(args):\n    feat_dim = args.hidden_dim\n    if args.position_embedding in ('v2', 'sine'):\n        position_embedding = PositionEmbeddingSine(feat_dim, normalize=True)\n    else:\n        raise ValueError(f\"not supported {args.position_embedding}\")\n\n    return position_embedding\n"
  },
  {
    "path": "models/tadtr.py",
    "content": "# ------------------------------------------------------------------------\n# TadTR: End-to-end Temporal Action Detection with Transformer\n# Copyright (c) 2021. Xiaolong Liu.\n# ------------------------------------------------------------------------\n# Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)\n# Copyright (c) 2020 SenseTime. All Rights Reserved.\n# Licensed under the Apache License, Version 2.0\n# ------------------------------------------------------------------------\n# and DETR (https://github.com/facebookresearch/detr)\n# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n# ------------------------------------------------------------------------\n\n\"\"\"\nTadTR model and criterion classes.\n\"\"\"\nimport math\nimport copy\n\nimport torch\nimport torch.nn.functional as F\nfrom torch import nn\n\nfrom util import segment_ops\nfrom util.misc import (NestedTensor, nested_tensor_from_tensor_list,\n                       accuracy, get_world_size,\n                       is_dist_avail_and_initialized, inverse_sigmoid)\nfrom models.matcher import build_matcher\nfrom models.position_encoding import build_position_encoding\nfrom .custom_loss import sigmoid_focal_loss\nfrom .transformer import build_deformable_transformer\nfrom opts import cfg\n\nif not cfg.disable_cuda:\n    from models.ops.roi_align import ROIAlign\n\n\ndef _get_clones(module, N):\n    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])\n\n\ndef get_norm(norm_type, dim, num_groups=None):\n    if norm_type == 'gn':\n        assert num_groups is not None, 'num_groups must be specified'\n        return nn.GroupNorm(num_groups, dim)\n    elif norm_type == 'bn':\n        return nn.BatchNorm1d(dim)\n    else:\n        raise NotImplementedError\n\n\nclass TadTR(nn.Module):\n    \"\"\" This is the TadTR module that performs temporal action detection \"\"\"\n\n    def __init__(self, position_embedding, transformer, 
num_classes, num_queries, aux_loss=True, with_segment_refine=True, with_act_reg=True):\n        \"\"\" Initializes the model.\n        Parameters:\n            backbone: torch module of the backbone to be used. See backbone.py\n            transformer: torch module of the transformer architecture. See deformable_transformer.py\n            num_classes: number of action classes\n            num_queries: number of action queries, ie detection slot. This is the maximal number of actions\n                         TadTR can detect in a single video.\n            aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.\n            with_segment_refine: iterative segment refinement\n        \"\"\"\n        super().__init__()\n        self.num_queries = num_queries\n        self.transformer = transformer\n        hidden_dim = transformer.d_model\n        self.class_embed = nn.Linear(hidden_dim, num_classes)\n        self.segment_embed = MLP(hidden_dim, hidden_dim, 2, 3)\n        self.query_embed = nn.Embedding(num_queries, hidden_dim*2)\n\n        self.input_proj = nn.ModuleList([\n            nn.Sequential(\n                nn.Conv1d(2048, hidden_dim, kernel_size=1),\n                nn.GroupNorm(32, hidden_dim),\n            )])\n        # self.backbone = backbone\n        self.position_embedding = position_embedding\n        self.aux_loss = aux_loss\n        self.with_segment_refine = with_segment_refine\n        self.with_act_reg = with_act_reg\n\n        prior_prob = 0.01\n        bias_value = -math.log((1 - prior_prob) / prior_prob)\n        self.class_embed.bias.data = torch.ones(num_classes) * bias_value\n        nn.init.constant_(self.segment_embed.layers[-1].weight.data, 0)\n        nn.init.constant_(self.segment_embed.layers[-1].bias.data, 0)\n        for proj in self.input_proj:\n            nn.init.xavier_uniform_(proj[0].weight, gain=1)\n            nn.init.constant_(proj[0].bias, 0)\n\n        num_pred = 
transformer.decoder.num_layers\n        if with_segment_refine:\n            self.class_embed = _get_clones(self.class_embed, num_pred)\n            self.segment_embed = _get_clones(self.segment_embed, num_pred)\n            nn.init.constant_(\n                self.segment_embed[0].layers[-1].bias.data[1:], -2.0)\n            # hack implementation for segment refinement\n            self.transformer.decoder.segment_embed = self.segment_embed\n        else:\n            nn.init.constant_(\n                self.segment_embed.layers[-1].bias.data[1:], -2.0)\n            self.class_embed = nn.ModuleList(\n                [self.class_embed for _ in range(num_pred)])\n            self.segment_embed = nn.ModuleList(\n                [self.segment_embed for _ in range(num_pred)])\n            self.transformer.decoder.segment_embed = None\n\n        if with_act_reg:\n            # RoIAlign params\n            self.roi_size = 16\n            self.roi_scale = 0\n            self.roi_extractor = ROIAlign(self.roi_size, self.roi_scale)\n            self.actionness_pred = nn.Sequential(\n                nn.Linear(self.roi_size * hidden_dim, hidden_dim),\n                nn.ReLU(inplace=True),\n                nn.Linear(hidden_dim, hidden_dim),\n                nn.ReLU(inplace=True),\n                nn.Linear(hidden_dim, 1),\n                nn.Sigmoid()\n            )\n\n    def _to_roi_align_format(self, rois, T, scale_factor=1):\n        '''Convert RoIs to RoIAlign format.\n        Params:\n            RoIs: normalized segments coordinates, shape (batch_size, num_segments, 4)\n            T: length of the video feature sequence\n        '''\n        # transform to absolute axis\n        B, N = rois.shape[:2]\n        rois_center = rois[:, :, 0:1]\n        rois_size = rois[:, :, 1:2] * scale_factor\n        rois_abs = torch.cat(\n            (rois_center - rois_size/2, rois_center + rois_size/2), dim=2) * T\n        # expand the RoIs\n        rois_abs = torch.clamp(rois_abs, 
min=0, max=T)  # (N, T, 2)\n        # add batch index\n        batch_ind = torch.arange(0, B).view((B, 1, 1)).to(rois_abs.device)\n        batch_ind = batch_ind.repeat(1, N, 1)\n        rois_abs = torch.cat((batch_ind, rois_abs), dim=2)\n        # NOTE: stop gradient here to stablize training\n        return rois_abs.view((B*N, 3)).detach()\n\n    def forward(self, samples):\n        \"\"\" The forward expects a NestedTensor, which consists of:\n               - samples.tensors: batched images, of shape [batch_size x 3 x H x W]\n               - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels\n            or a tuple of tensors and mask\n\n            It returns a dict with the following elements:\n               - \"pred_logits\": the classification logits (including no-action) for all queries.\n                                Shape= [batch_size x num_queries x (num_classes + 1)]\n               - \"pred_segments\": The normalized segments coordinates for all queries, represented as\n                               (center, width). These values are normalized in [0, 1],\n                               relative to the size of each individual image (disregarding possible padding).\n                               See PostProcess for information on how to retrieve the unnormalized segment.\n               - \"aux_outputs\": Optional, only returned when auxilary losses are activated. 
It is a list of\n                                dictionnaries containing the two above keys for each decoder layer.\n        \"\"\"\n        if not isinstance(samples, NestedTensor):\n            if isinstance(samples, (list, tuple)):\n                samples = NestedTensor(*samples)\n            else:\n                samples = nested_tensor_from_tensor_list(samples)  # (n, c, t)\n\n        pos = [self.position_embedding(samples)]\n        src, mask = samples.tensors, samples.mask\n        srcs = [self.input_proj[0](src)]\n        masks = [mask]\n\n        query_embeds = self.query_embed.weight\n        hs, init_reference, inter_references, memory = self.transformer(\n            srcs, masks, pos, query_embeds)\n\n        outputs_classes = []\n        outputs_coords = []\n        # gather outputs from each decoder layer\n        for lvl in range(hs.shape[0]):\n            if lvl == 0:\n                reference = init_reference\n            else:\n                reference = inter_references[lvl - 1]\n\n            reference = inverse_sigmoid(reference)\n            outputs_class = self.class_embed[lvl](hs[lvl])\n            tmp = self.segment_embed[lvl](hs[lvl])\n            # the l-th layer (l >= 2)\n            if reference.shape[-1] == 2:\n                tmp += reference\n            # the first layer\n            else:\n                assert reference.shape[-1] == 1\n                tmp[..., 0] += reference[..., 0]\n            outputs_coord = tmp.sigmoid()\n            outputs_classes.append(outputs_class)\n            outputs_coords.append(outputs_coord)\n        outputs_class = torch.stack(outputs_classes)\n        outputs_coord = torch.stack(outputs_coords)\n\n        if not self.with_act_reg:\n            out = {'pred_logits': outputs_class[-1],\n                   'pred_segments': outputs_coord[-1]}\n        else:\n            # perform RoIAlign\n            B, N = outputs_coord[-1].shape[:2]\n            origin_feat = memory\n\n            rois = 
self._to_roi_align_format(\n                outputs_coord[-1], origin_feat.shape[2], scale_factor=1.5)\n            roi_features = self.roi_extractor(origin_feat, rois)\n            roi_features = roi_features.view((B, N, -1))\n            pred_actionness = self.actionness_pred(roi_features)\n\n            last_layer_cls = outputs_class[-1]\n            last_layer_reg = outputs_coord[-1]\n\n            out = {'pred_logits': last_layer_cls,\n                   'pred_segments': last_layer_reg, 'pred_actionness': pred_actionness}\n\n        if self.aux_loss:\n            out['aux_outputs'] = self._set_aux_loss(\n                outputs_class, outputs_coord)\n\n        return out\n\n    @torch.jit.unused\n    def _set_aux_loss(self, outputs_class, outputs_coord):\n        # this is a workaround to make torchscript happy, as torchscript\n        # doesn't support dictionary with non-homogeneous values, such\n        # as a dict having both a Tensor and a list.\n        return [{'pred_logits': a, 'pred_segments': b}\n                for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]\n\n\nclass SetCriterion(nn.Module):\n    \"\"\" This class computes the loss for TadTR.\n    The process happens in two steps:\n        1) we compute hungarian assignment between ground truth segments and the outputs of the model\n        2) we supervise each pair of matched ground-truth / prediction (supervise class and segment)\n    \"\"\"\n\n    def __init__(self, num_classes, matcher, weight_dict, losses, focal_alpha=0.25):\n        \"\"\" Create the criterion.\n        Parameters:\n            num_classes: number of action categories, omitting the special no-action category\n            matcher: module able to compute a matching between targets and proposals\n            weight_dict: dict containing as key the names of the losses and as values their relative weight.\n            losses: list of all the losses to be applied. 
See get_loss for list of available losses.\n            focal_alpha: alpha in Focal Loss\n        \"\"\"\n        super().__init__()\n        self.num_classes = num_classes\n        self.matcher = matcher\n        self.weight_dict = weight_dict\n        self.losses = losses\n        self.focal_alpha = focal_alpha\n\n    def loss_labels(self, outputs, targets, indices, num_segments, log=True):\n        \"\"\"Classification loss (NLL)\n        targets dicts must contain the key \"labels\" containing a tensor of dim [nb_target_segments]\n        \"\"\"\n        assert 'pred_logits' in outputs\n        src_logits = outputs['pred_logits']\n\n        idx = self._get_src_permutation_idx(indices)\n        target_classes_o = torch.cat([t[\"labels\"][J] for t, (_, J) in zip(targets, indices)])\n        target_classes = torch.full(src_logits.shape[:2], self.num_classes,\n                                    dtype=torch.int64, device=src_logits.device)\n        target_classes[idx] = target_classes_o\n\n        target_classes_onehot = torch.zeros([src_logits.shape[0], src_logits.shape[1], src_logits.shape[2] + 1],\n                                            dtype=src_logits.dtype, layout=src_logits.layout, device=src_logits.device)\n        target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1)\n\n        target_classes_onehot = target_classes_onehot[:,:,:-1]\n        loss_ce = sigmoid_focal_loss(src_logits, target_classes_onehot, num_segments, alpha=self.focal_alpha, gamma=2) * src_logits.shape[1]  # nq\n        losses = {'loss_ce': loss_ce}\n\n        if log:\n            # TODO this should probably be a separate loss, not hacked in this one here\n            losses['class_error'] = 100 - accuracy(src_logits[idx], target_classes_o)[0]\n\n        return losses\n\n    def loss_segments(self, outputs, targets, indices, num_segments):\n        \"\"\"Compute the losses related to the segmentes, the L1 regression loss and the IoU loss\n           targets dicts must 
contain the key \"segments\" containing a tensor of dim [nb_target_segments, 2]\n           The target segments are expected in format (center, width), normalized by the video length.\n        \"\"\"\n        assert 'pred_segments' in outputs\n        idx = self._get_src_permutation_idx(indices)\n        src_segments = outputs['pred_segments'][idx]\n        target_segments = torch.cat([t['segments'][i] for t, (_, i) in zip(targets, indices)], dim=0)\n\n        loss_segment = F.l1_loss(src_segments, target_segments, reduction='none')\n\n        losses = {}\n        losses['loss_segments'] = loss_segment.sum() / num_segments\n\n        loss_iou = 1 - torch.diag(segment_ops.segment_iou(\n            segment_ops.segment_cw_to_t1t2(src_segments),\n            segment_ops.segment_cw_to_t1t2(target_segments)))\n        losses['loss_iou'] = loss_iou.sum() / num_segments\n        return losses\n\n    def loss_actionness(self, outputs, targets, indices, num_segments):\n        \"\"\"Compute the actionness regression loss\n           targets dicts must contain the key \"segments\" containing a tensor of dim [nb_target_segments, 2]\n           The target segments are expected in format (center, width), normalized by the video length.\n        \"\"\"\n        assert 'pred_segments' in outputs\n        assert 'pred_actionness' in outputs\n        src_segments = outputs['pred_segments'].view((-1, 2))\n        target_segments = torch.cat([t['segments'] for t in targets], dim=0)\n\n        losses = {}\n\n        iou_mat = segment_ops.segment_iou(\n            segment_ops.segment_cw_to_t1t2(src_segments),\n            segment_ops.segment_cw_to_t1t2(target_segments))\n\n        gt_iou = iou_mat.max(dim=1)[0]\n        pred_actionness = outputs['pred_actionness']\n        loss_actionness = F.l1_loss(pred_actionness.view(-1), gt_iou.view(-1).detach())   \n\n        losses['loss_actionness'] = loss_actionness\n        return losses\n\n    def _get_src_permutation_idx(self, indices):\n    
    # permute predictions following indices\n        batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])\n        src_idx = torch.cat([src for (src, _) in indices])\n        return batch_idx, src_idx\n\n    def _get_tgt_permutation_idx(self, indices):\n        # permute targets following indices\n        batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])\n        tgt_idx = torch.cat([tgt for (_, tgt) in indices])\n        return batch_idx, tgt_idx\n\n    def get_loss(self, loss, outputs, targets, indices, num_segments, **kwargs):\n        loss_map = {\n            'labels': self.loss_labels,\n            'segments': self.loss_segments,\n            'actionness': self.loss_actionness,\n        }\n\n        assert loss in loss_map, f'do you really want to compute {loss} loss?'\n        return loss_map[loss](outputs, targets, indices, num_segments, **kwargs)\n\n    def forward(self, outputs, targets):\n        \"\"\" This performs the loss computation.\n        Parameters:\n             outputs: dict of tensors, see the output specification of the model for the format\n             targets: list of dicts, such that len(targets) == batch_size.\n                      The expected keys in each dict depends on the losses applied, see each loss' doc\n        \"\"\"\n        outputs_without_aux = {k: v for k, v in outputs.items() if k != 'aux_outputs'}\n\n        # Retrieve the matching between the outputs of the last layer and the targets\n        indices = self.matcher(outputs_without_aux, targets)\n\n        # Compute the average number of target segments accross all nodes, for normalization purposes\n        num_segments = sum(len(t[\"labels\"]) for t in targets)\n        num_segments = torch.as_tensor([num_segments], dtype=torch.float, device=next(iter(outputs.values())).device)\n        if is_dist_avail_and_initialized():\n            torch.distributed.all_reduce(num_segments)\n        num_segments 
= torch.clamp(num_segments / get_world_size(), min=1).item()\n\n        # Compute all the requested losses\n        losses = {}\n        for loss in self.losses:\n            kwargs = {}\n            losses.update(self.get_loss(loss, outputs, targets, indices, num_segments, **kwargs))\n\n        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.\n        if 'aux_outputs' in outputs:\n            for i, aux_outputs in enumerate(outputs['aux_outputs']):\n                indices = self.matcher(aux_outputs, targets)\n                for loss in self.losses:\n                    # we do not compute actionness loss for aux outputs\n                    if 'actionness' in loss:\n                        continue\n         \n                    kwargs = {}\n                    if loss == 'labels':\n                        # Logging is enabled only for the last layer\n                        kwargs['log'] = False\n                    l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_segments, **kwargs)\n                    l_dict = {k + f'_{i}': v for k, v in l_dict.items()}\n                    losses.update(l_dict)\n\n        self.indices = indices\n        return losses\n\n\nclass PostProcess(nn.Module):\n    \"\"\" This module converts the model's output into the format expected by the TADEvaluator\"\"\"\n\n    @torch.no_grad()\n    def forward(self, outputs, target_sizes, fuse_score=True):\n        \"\"\" Perform the computation\n        Parameters:\n            outputs: raw outputs of the model\n            target_sizes: tensor of dimension [batch_size] containing the duration of each video of the batch\n        \"\"\"\n        out_logits, out_segments = outputs['pred_logits'], outputs['pred_segments']\n\n        assert len(out_logits) == len(target_sizes)\n        # assert target_sizes.shape[1] == 1\n\n        prob = out_logits.sigmoid()   # [bs, nq, C]\n        if fuse_score:\n            prob *= 
outputs['pred_actionness']\n\n        segments = segment_ops.segment_cw_to_t1t2(out_segments)   # bs, nq, 2\n\n        if cfg.postproc_rank == 1:     # default\n            # sort across different instances, pick top 100 at most\n            topk_values, topk_indexes = torch.topk(prob.view(\n                out_logits.shape[0], -1), min(cfg.postproc_ins_topk, prob.shape[1]*prob.shape[2]), dim=1)\n            scores = topk_values\n            topk_segments = topk_indexes // out_logits.shape[2]\n            labels = topk_indexes % out_logits.shape[2]\n\n            # bs, nq, 2; bs, num, 2\n            segments = torch.gather(\n                segments, 1, topk_segments.unsqueeze(-1).repeat(1, 1, 2))\n            query_ids = topk_segments\n        else:\n            # pick topk classes for each query\n            # pdb.set_trace()\n            scores, labels = torch.topk(prob, cfg.postproc_cls_topk, dim=-1)\n            scores, labels = scores.flatten(1), labels.flatten(1)\n            # (bs, nq, 1, 2)\n            segments = segments[:, [\n                i//cfg.postproc_cls_topk for i in range(cfg.postproc_cls_topk*segments.shape[1])], :]\n            query_ids = (torch.arange(0, cfg.postproc_cls_topk*segments.shape[1], 1, dtype=labels.dtype,\n                         device=labels.device) // cfg.postproc_cls_topk)[None, :].repeat(labels.shape[0], 1)\n\n        # from normalized [0, 1] to absolute [0, length] coordinates\n        vid_length = target_sizes\n        scale_fct = torch.stack([vid_length, vid_length], dim=1)\n        segments = segments * scale_fct[:, None, :]\n\n        results = [{'scores': s, 'labels': l, 'segments': b, 'query_ids': q}\n                   for s, l, b, q in zip(scores, labels, segments, query_ids)]\n\n        return results\n\n\nclass MLP(nn.Module):\n    \"\"\" Very simple multi-layer perceptron (also called FFN)\"\"\"\n\n    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):\n        super().__init__()\n        
self.num_layers = num_layers\n        h = [hidden_dim] * (num_layers - 1)\n        self.layers = nn.ModuleList(nn.Linear(n, k)\n                                    for n, k in zip([input_dim] + h, h + [output_dim]))\n\n    def forward(self, x):\n        for i, layer in enumerate(self.layers):\n            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)\n        return x\n\n\ndef build(args):\n    if args.binary:\n        num_classes = 1\n    else:\n        if args.dataset_name == 'thumos14':\n            num_classes = 20\n        elif args.dataset_name == 'muses':\n            num_classes = 25\n        elif args.dataset_name in ['activitynet', 'hacs']:\n            num_classes = 200\n        else:\n            raise ValueError('unknown dataset {}'.format(args.dataset_name))\n\n    pos_embed = build_position_encoding(args)\n    transformer = build_deformable_transformer(args)\n\n    model = TadTR(\n        pos_embed,\n        transformer,\n        num_classes=num_classes,\n        num_queries=args.num_queries,\n        aux_loss=args.aux_loss,\n        with_segment_refine=args.seg_refine,\n        with_act_reg=args.act_reg\n    )\n\n    matcher = build_matcher(args)\n    losses = ['labels', 'segments']\n\n    weight_dict = {\n        'loss_ce': args.cls_loss_coef, \n        'loss_segments': args.seg_loss_coef,\n        'loss_iou': args.iou_loss_coef}\n\n    if args.act_reg:\n        weight_dict['loss_actionness'] = args.act_loss_coef\n        losses.append('actionness')\n\n    if args.aux_loss:\n        aux_weight_dict = {}\n        for i in range(args.dec_layers - 1):\n            aux_weight_dict.update({k + f'_{i}': v for k, v in weight_dict.items()})\n        aux_weight_dict.update({k + f'_enc': v for k, v in weight_dict.items()})\n        weight_dict.update(aux_weight_dict)\n\n    criterion = SetCriterion(num_classes, matcher,\n        weight_dict, losses, focal_alpha=args.focal_alpha)\n\n    postprocessor = PostProcess()\n\n    return model, 
criterion, postprocessor\n"
  },
  {
    "path": "models/transformer.py",
    "content": "# ------------------------------------------------------------------------\n# TadTR: End-to-end Temporal Action Detection with Transformer\n# Copyright (c) 2021. Xiaolong Liu.\n# ------------------------------------------------------------------------\n# Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)\n# Copyright (c) 2020 SenseTime. All Rights Reserved.\n# Licensed under the Apache License, Version 2.0\n# ------------------------------------------------------------------------\n# and DETR (https://github.com/facebookresearch/detr)\n# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n# ------------------------------------------------------------------------\n\nimport copy\n\nimport torch\nimport torch.nn.functional as F\nfrom torch import nn, Tensor\nfrom torch.nn.init import xavier_uniform_, constant_, uniform_, normal_\n\nfrom util.misc import inverse_sigmoid\nfrom models.ops.temporal_deform_attn import DeformAttn\nfrom opts import cfg\n\n\nclass DeformableTransformer(nn.Module):\n    def __init__(self, d_model=256, nhead=8,\n                 num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=1024, dropout=0.1,\n                 activation=\"relu\", return_intermediate_dec=False,\n                 num_feature_levels=4, dec_n_points=4,  enc_n_points=4):\n        super().__init__()\n\n        self.d_model = d_model\n        self.nhead = nhead\n\n        encoder_layer = DeformableTransformerEncoderLayer(d_model, dim_feedforward,\n                                                        dropout, activation,\n                                                        num_feature_levels, nhead, enc_n_points)\n        self.encoder = DeformableTransformerEncoder(encoder_layer, num_encoder_layers)\n\n        decoder_layer = DeformableTransformerDecoderLayer(d_model, dim_feedforward,\n                                                          dropout, activation,\n                                  
                        num_feature_levels, nhead, dec_n_points)\n        self.decoder = DeformableTransformerDecoder(decoder_layer, num_decoder_layers, return_intermediate_dec)\n\n        self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model))\n\n        self.reference_points = nn.Linear(d_model, 1)\n\n        self._reset_parameters()\n\n    def _reset_parameters(self):\n        for p in self.parameters():\n            if p.dim() > 1:\n                nn.init.xavier_uniform_(p)\n        for m in self.modules():\n            if isinstance(m, DeformAttn):\n                m._reset_parameters()\n\n        xavier_uniform_(self.reference_points.weight.data, gain=1.0)\n        constant_(self.reference_points.bias.data, 0.)\n        normal_(self.level_embed)\n\n    def get_valid_ratio(self, mask):\n        _, T = mask.shape\n        valid_T = torch.sum(~mask, 1)\n        valid_ratio = valid_T.float() / T\n        return valid_ratio    # shape=(bs)\n\n    def forward(self, srcs, masks, pos_embeds, query_embed=None):\n        '''\n        Params:\n            srcs: list of Tensor with shape (bs, c, t)\n            masks: list of Tensor with shape (bs, t)\n            pos_embeds: list of Tensor with shape (bs, c, t)\n            query_embed: list of Tensor with shape (nq, 2c)\n        Returns:\n            hs: list, per layer output of decoder\n            init_reference_out: reference points predicted from query embeddings\n            inter_references_out: reference points predicted from each decoder layer\n            memory: (bs, c, t), final output of the encoder\n        '''\n        assert query_embed is not None\n        # prepare input for encoder\n        src_flatten = []\n        mask_flatten = []\n        lvl_pos_embed_flatten = []\n        temporal_lens = []\n        for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)):\n            bs, c, t = src.shape\n            temporal_lens.append(t)\n            # (bs, c, t) => 
(bs, t, c)\n            src = src.transpose(1, 2)   \n            pos_embed = pos_embed.transpose(1, 2)\n            lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1)\n            lvl_pos_embed_flatten.append(lvl_pos_embed)\n            src_flatten.append(src)\n            mask_flatten.append(mask)\n\n        src_flatten = torch.cat(src_flatten, 1)\n        mask_flatten = torch.cat(mask_flatten, 1)\n        lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)\n        temporal_lens = torch.as_tensor(temporal_lens, dtype=torch.long, device=src_flatten.device)\n        level_start_index = torch.cat((temporal_lens.new_zeros((1, )), temporal_lens.cumsum(0)[:-1]))\n        valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)   # (bs, nlevels)\n\n        # deformable encoder\n        memory = self.encoder(src_flatten, temporal_lens, level_start_index, valid_ratios, \n            lvl_pos_embed_flatten if cfg.use_pos_embed else None, \n            mask_flatten)  # shape=(bs, t, c)\n\n        bs, _, c = memory.shape\n        \n        query_embed, tgt = torch.split(query_embed, c, dim=1)\n        query_embed = query_embed.unsqueeze(0).expand(bs, -1, -1)\n        tgt = tgt.unsqueeze(0).expand(bs, -1, -1)\n        reference_points = self.reference_points(query_embed).sigmoid()\n        init_reference_out = reference_points\n\n        # decoder\n        hs, inter_references = self.decoder(tgt, reference_points, memory,\n                                            temporal_lens, level_start_index, valid_ratios, query_embed, mask_flatten)\n        inter_references_out = inter_references \n        return hs, init_reference_out, inter_references_out, memory.transpose(1, 2)\n\n\nclass DeformableTransformerEncoderLayer(nn.Module):\n    def __init__(self,\n                 d_model=256, d_ffn=1024,\n                 dropout=0.1, activation=\"relu\",\n                 n_levels=4, n_heads=8, n_points=4):\n        super().__init__()\n\n        # 
self attention\n        self.self_attn = DeformAttn(d_model, n_levels, n_heads, n_points)\n        self.dropout1 = nn.Dropout(dropout)\n        self.norm1 = nn.LayerNorm(d_model)\n\n        # ffn\n        self.linear1 = nn.Linear(d_model, d_ffn)\n        self.activation = _get_activation_fn(activation)\n        self.dropout2 = nn.Dropout(dropout)\n        self.linear2 = nn.Linear(d_ffn, d_model)\n        self.dropout3 = nn.Dropout(dropout)\n        self.norm2 = nn.LayerNorm(d_model)\n\n    @staticmethod\n    def with_pos_embed(tensor, pos):\n        return tensor if pos is None else tensor + pos\n\n    def forward_ffn(self, src):\n        src2 = self.linear2(self.dropout2(self.activation(self.linear1(src))))\n        src = src + self.dropout3(src2)\n        src = self.norm2(src)\n        return src\n\n    def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, padding_mask=None):\n        # self attention\n        src2, _ = self.self_attn(self.with_pos_embed(src, pos), reference_points, src, spatial_shapes, level_start_index, padding_mask)\n        src = src + self.dropout1(src2)\n        src = self.norm1(src)\n\n        # ffn\n        src = self.forward_ffn(src)\n\n        return src\n\n\nclass DeformableTransformerEncoder(nn.Module):\n    def __init__(self, encoder_layer, num_layers):\n        super().__init__()\n        self.layers = _get_clones(encoder_layer, num_layers)\n        self.num_layers = num_layers\n\n    @staticmethod\n    def get_reference_points(spatial_shapes, valid_ratios, device):\n        reference_points_list = []\n        for lvl, T_ in enumerate(spatial_shapes):\n            ref = torch.linspace(0.5, T_ - 0.5, T_, dtype=torch.float32, device=device)  # (t,)\n            ref = ref[None] / (valid_ratios[:, None, lvl] * T_)                          # (bs, t)\n            reference_points_list.append(ref)\n        reference_points = torch.cat(reference_points_list, 1)\n        reference_points = reference_points[:, :, 
None] * valid_ratios[:, None]          # (N, t, n_levels)\n        return reference_points[..., None]                                               # (N, t, n_levels, 1)\n\n    def forward(self, src, temporal_lens, level_start_index, valid_ratios, pos=None, padding_mask=None):\n        '''\n        src: shape=(bs, t, c)\n        temporal_lens: shape=(n_levels). content: [t1, t2, t3, ...]\n        level_start_index: shape=(n_levels,). [0, t1, t1+t2, ...]\n        valid_ratios: shape=(bs, n_levels).\n        '''\n        output = src\n        # (bs, t, levels, 1)\n        reference_points = self.get_reference_points(temporal_lens, valid_ratios, device=src.device)\n        for _, layer in enumerate(self.layers):\n            output = layer(output, pos, reference_points, temporal_lens, level_start_index, padding_mask)\n        return output\n\n\nclass DeformableTransformerDecoderLayer(nn.Module):\n    def __init__(self, d_model=256, d_ffn=1024,\n                 dropout=0.1, activation=\"relu\",\n                 n_levels=4, n_heads=8, n_points=4):\n        super().__init__()\n\n        # cross attention\n        self.cross_attn = DeformAttn(d_model, n_levels, n_heads, n_points)\n        self.dropout1 = nn.Dropout(dropout)\n        self.norm1 = nn.LayerNorm(d_model)\n\n        # self attention\n        self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)\n        self.dropout2 = nn.Dropout(dropout)\n        self.norm2 = nn.LayerNorm(d_model)\n\n        # ffn\n        self.linear1 = nn.Linear(d_model, d_ffn)\n        self.activation = _get_activation_fn(activation)\n        self.dropout3 = nn.Dropout(dropout)\n        self.linear2 = nn.Linear(d_ffn, d_model)\n        self.dropout4 = nn.Dropout(dropout)\n        self.norm3 = nn.LayerNorm(d_model)\n\n    @staticmethod\n    def with_pos_embed(tensor, pos):\n        return tensor if pos is None else tensor + pos\n\n    def forward_ffn(self, tgt):\n        tgt2 = 
self.linear2(self.dropout3(self.activation(self.linear1(tgt))))\n        tgt = tgt + self.dropout4(tgt2)\n        tgt = self.norm3(tgt)\n        return tgt\n\n    def forward(self, tgt, query_pos, reference_points, src, src_spatial_shapes, level_start_index, src_padding_mask=None):\n        if not cfg.disable_query_self_att:\n            # self attention\n            q = k = self.with_pos_embed(tgt, query_pos)\n\n            tgt2 = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), tgt.transpose(0, 1))[0].transpose(0, 1)\n            tgt = tgt + self.dropout2(tgt2)\n            tgt = self.norm2(tgt)\n\n        else:\n            pass\n        # cross attention\n        tgt2, _ = self.cross_attn(self.with_pos_embed(tgt, query_pos),\n                               reference_points,\n                               src, src_spatial_shapes, level_start_index, src_padding_mask)\n        tgt = tgt + self.dropout1(tgt2)\n        tgt = self.norm1(tgt)\n\n        # ffn\n        tgt = self.forward_ffn(tgt)\n\n        return tgt\n\n\nclass DeformableTransformerDecoder(nn.Module):\n    def __init__(self, decoder_layer, num_layers, return_intermediate=False):\n        super().__init__()\n        self.layers = _get_clones(decoder_layer, num_layers)\n        self.num_layers = num_layers\n        self.return_intermediate = return_intermediate\n        # hack implementation for iterative bounding box refinement and two-stage Deformable DETR\n        self.segment_embed = None\n        self.class_embed = None\n\n    def forward(self, tgt, reference_points, src, src_spatial_shapes, src_level_start_index, src_valid_ratios,\n                query_pos=None, src_padding_mask=None):\n        '''\n        tgt: [bs, nq, C]\n        reference_points: [bs, nq, 1 or 2]\n        src: [bs, T, C]\n        src_valid_ratios: [bs, levels]\n        '''\n        output = tgt\n        intermediate = []\n        intermediate_reference_points = []\n        for lid, layer in enumerate(self.layers):\n      
      # (bs, nq, 1, 1 or 2) x (bs, 1, num_level, 1) => (bs, nq, num_level, 1 or 2)\n            reference_points_input = reference_points[:, :, None] * src_valid_ratios[:, None,:, None]\n            output = layer(output, query_pos, reference_points_input, src, src_spatial_shapes, src_level_start_index, src_padding_mask)\n            \n            # hack implementation for segment refinement\n            if self.segment_embed is not None:\n                # update the reference point/segment of the next layer according to the output from the current layer\n                tmp = self.segment_embed[lid](output)\n                if reference_points.shape[-1] == 2:\n                    new_reference_points = tmp + inverse_sigmoid(reference_points)\n                    new_reference_points = new_reference_points.sigmoid()\n                else:\n                    # at the 0-th decoder layer\n                    # d^(n+1) = delta_d^(n+1)\n                    # c^(n+1) = sigmoid( inverse_sigmoid(c^(n)) + delta_c^(n+1))\n                    assert reference_points.shape[-1] == 1\n                    new_reference_points = tmp\n                    new_reference_points[..., :1] = tmp[..., :1] + inverse_sigmoid(reference_points)\n                    new_reference_points = new_reference_points.sigmoid()\n                reference_points = new_reference_points.detach()\n\n            if self.return_intermediate:\n                intermediate.append(output)\n                intermediate_reference_points.append(reference_points)\n        if self.return_intermediate:\n            return torch.stack(intermediate), torch.stack(intermediate_reference_points)\n\n        return output, reference_points\n\n\ndef _get_clones(module, N):\n    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])\n\n\ndef _get_activation_fn(activation):\n    \"\"\"Return an activation function given a string\"\"\"\n    if activation == \"relu\":\n        return F.relu\n    if activation == 
\"gelu\":\n        return F.gelu\n    if activation == \"glu\":\n        return F.glu\n    if activation == \"leaky_relu\":\n        return F.leaky_relu\n    raise RuntimeError(F\"activation should be relu/gelu, not {activation}.\")\n\n\ndef build_deformable_transformer(args):\n    return DeformableTransformer(\n        d_model=args.hidden_dim,\n        nhead=args.nheads,\n        num_encoder_layers=args.enc_layers,\n        num_decoder_layers=args.dec_layers,\n        dim_feedforward=args.dim_feedforward,\n        dropout=args.dropout,\n        activation=args.activation,\n        return_intermediate_dec=True,\n        num_feature_levels=1,\n        dec_n_points=args.dec_n_points,\n        enc_n_points=args.enc_n_points)\n\n\n"
  },
  {
    "path": "opts.py",
    "content": "# ------------------------------------------------------------------------\n# TadTR: End-to-end Temporal Action Detection with Transformer\n# Copyright (c) 2021 - 2022. Xiaolong Liu.\n# ------------------------------------------------------------------------\n\n\nimport argparse\nfrom easydict import EasyDict\nimport yaml\n\n\n\ndef str2bool(x):\n    if x.lower() in ['true', 't', '1', 'y']:\n        return True\n    else:\n        return False\n\n\ndef get_args_parser():\n    parser = argparse.ArgumentParser('TadTR', add_help=False)\n\n    parser.add_argument('--cfg', type=str, help='the config file to use')\n\n    parser.add_argument('--device', default='cuda',\n                        help='device to use for training / testing')\n    parser.add_argument('--seed', default=42, type=int)\n\n    parser.add_argument('--resume', default='', help='resume from checkpoint')\n    \n    parser.add_argument('--eval', action='store_true', help='perform testing')\n    parser.add_argument('--num_workers', default=2, type=int, help='number of dataloader workers')\n\n    # Multi-GPU training\n    # We support both DataParallel and Distributed DataParallel (DDP)\n    parser.add_argument('--multi_gpu', action='store_true', help='use nn.DataParallel')\n    parser.add_argument('--world_size', default=1, type=int,\n                        help='number of distributed processes')\n    parser.add_argument('--dist_url', default='env://',\n                        help='url used to set up distributed training')\n\n    # Other options\n    parser.add_argument('opt', nargs=argparse.REMAINDER,\n                        help='Command arguments that override configs')\n    return parser\n\n\n\ncfg = EasyDict()\n# ---- Basic option ----\n# whether to enable tensorboard\ncfg.tensorboard = False\n# Disable CUDA extensions so that we can run the model on CPU\ncfg.disable_cuda = False\n# The backend of deformable attention, pytorch or CUDA\ncfg.dfm_att_backend = 'pytorch'\n\n# path 
where to save, empty for no saving\ncfg.output_dir = ''\n\n\n# ------ Data options ------\ncfg.dataset_name = 'thumos14'\n# Use feature input or raw image input (jointly train the video encoder and the detection head). Choices: {feature, image}\ncfg.input_type = 'feature'   \n# Which kind of feature to use, e.g. i3d, tsn.\ncfg.feature = 'i3d2s'\n# dimension (channels) of the video feature\ncfg.feature_dim = 2048\n# Perform binary detection (proposal generation) only \ncfg.binary = False\n# Which subset to test on, 'val' or 'test' (for Anet and HACS). Note that we rename the training/validation/testing subsets for all datasets. For example, the validation subset used for training on THUMOS14 is renamed as the 'train' subset.\ncfg.test_set = 'val'\n# whether to crop video into windows (A window is also called a slice in this codebase). Required for THUMOS14\ncfg.online_slice = False\n# length of video slices. For feature input, the length is for feature sequence. For video input, the length is for frame sequence.\ncfg.slice_len = None\n# overlap ratio (=overlap_length/slice_length) between adjacent slices during training\ncfg.slice_overlap = 0\n# overlap ratio between adjacent slices during inference \ncfg.test_slice_overlap = 0\n\n\n# ---- Model option --------\n# Name of the convolutional backbone to use. If we use video features as input, backbone should be 'none' \ncfg.backbone = 'none'\n\n# whether to use position embedding\ncfg.use_pos_embed = True\n# Type of positional embedding to use on top of the video features. 
Only support sine embedding.\ncfg.position_embedding = \"sine\"\n\n# Number of encoding layers in the transformer\ncfg.enc_layers = 2\n# Number of decoding layers in the transformer\ncfg.dec_layers = 4\n# Intermediate size of the feedforward layers in the transformer blocks\ncfg.dim_feedforward = 2048\n# Size of the embeddings (dimension of the transformer)\ncfg.hidden_dim = 256\n# Dropout applied in the transformer\ncfg.dropout = 0.1\n# Number of attention heads inside the transformer's attentions\ncfg.nheads = 8\n# Number of sampled points per head for deformable attention in the encoder\ncfg.enc_n_points = 4\n# Number of sampled points per head for deformable attention in the decoder\ncfg.dec_n_points = 4\n# Number of action queries\ncfg.num_queries = 30\n# Transformer activation type, relu|leaky_relu|gelu\ncfg.activation = 'relu'\n# Whether to enable segment refinement mechanism\ncfg.seg_refine = True\n# Whether to enable actionness regression head\ncfg.act_reg = True\n# whether to disable self-attention between action queries\ncfg.disable_query_self_att = False\n\n\n# ----- Loss and matcher setting -------\n# Enable auxiliary decoding losses (loss at each layer)\ncfg.aux_loss = True\n\n# Loss weight \ncfg.act_loss_coef = 4\ncfg.cls_loss_coef = 2\ncfg.seg_loss_coef = 5\ncfg.iou_loss_coef = 2\n# Relative classification weight of the no-action class\ncfg.eos_coef = 0.1\n# For focal loss\ncfg.focal_alpha = 0.25\n\n# Set cost weight\ncfg.set_cost_class = 6    # Class coefficient \ncfg.set_cost_seg = 5      # Segment L1 coefficient \ncfg.set_cost_iou = 2      # Segment IoU coefficient\n\n\n# ----- Training option -------\n# base learning rate. 
If you set lr in a yaml file, don't use scientific notation such as 2e-4; write 0.0002 instead\ncfg.lr = 2e-4\n\n# Valid only when the input is video frames\n# specify the name pattern of the backbone layers.\ncfg.lr_backbone_names = ['backbone']\n# learning rate of backbone layers\ncfg.lr_backbone = 1e-5\n\n# special linear projection layers that need to use smaller lr\ncfg.lr_linear_proj_names = ['reference_points', 'sampling_offsets']\ncfg.lr_linear_proj_mult = 0.1\n\n# which optimizer to use, choose from ['AdamW', 'Adam', 'SGD']\ncfg.optimizer = 'AdamW'\ncfg.batch_size = 16\ncfg.weight_decay = 1e-4\n# gradient clipping max norm\ncfg.clip_max_norm = 0.1\n\n# maximum number of training epochs\ncfg.epochs = 16\n\n# when to decay lr\ncfg.lr_step = [14]\n# save checkpoint every N epochs. Set it to a small value if you want to save intermediate models\ncfg.ckpt_interval = 10\n# update parameters every N forward-backward passes. N=1 (default)\ncfg.iter_size = 1\n# test model every N epochs. N=1 (default)\ncfg.test_interval = 1\n\n\n# ----- Postproc option -------\n# How to rank the predicted instances. \n# 1: for each query, generate an instance for each class; then pick the top-scored instances from the whole set\n# 2: pick top classes for each query\ncfg.postproc_rank = 1\n# for each query, pick top k classes; keep all queries\n# this setting is useful for debugging\ncfg.postproc_cls_topk = 1\n# for each video, pick topk detections\ncfg.postproc_ins_topk = 100\n# IoU threshold for NMS. 
Note that NMS is not necessary.\ncfg.nms_thr = 0.4\n\n\n\ndef update_cfg_with_args(cfg, arg_list):\n    from ast import literal_eval\n    for i in range(0, len(arg_list), 2):\n        cur_entry = cfg\n        key_parts = arg_list[i].split('.')\n        for k in key_parts[:-1]:\n            cur_entry = cur_entry[k]\n        node = key_parts[-1]\n        try:\n            cur_entry[node] = literal_eval(arg_list[i+1])\n        except:\n            # print(f'literal_eval({arg_list[i+1]}) failed, directly take the value')\n            cur_entry[node] = arg_list[i+1]\n\n\ndef update_cfg_from_file(cfg, cfg_path):\n    import os\n    assert os.path.exists(cfg_path), 'cfg_path is invalid'\n    cfg_from_file = yaml.load(open(cfg_path), yaml.FullLoader)\n    cfg.update(cfg_from_file)"
  },
  {
    "path": "requirements.txt",
    "content": "torch>=1.5.1\ntorchvision>=0.6.1\nscipy\ntqdm\neasydict\nPyYAML\nnumpy\npandas\n"
  },
  {
    "path": "scripts/run_parallel.sh",
    "content": "# Run on two GPUs in non-distributed mode (more convenient)\nCUDA_VISIBLE_DEVICES=0,1 python -u main.py --cfg \"CFG_PATH\" --multi_gpu\n\n# Run on two GPUs in distributed mode (more powerful)\nMASTER_PORT=29510\nCUDA_VISIBLE_DEVICES=0,1 python -u -m torch.distributed.launch --nproc_per_node=2 --master_port ${MASTER_PORT} --use_env main.py --cfg \"CFG_PATH\"\n"
  },
  {
    "path": "scripts/test_reference_models.sh",
    "content": "dataset=$1\n   \nif [[ $dataset = thumos14 ]];then\n\n    CUDA_VISIBLE_DEVICES=0 python main.py --cfg configs/thumos14_i3d2s_tadtr.yml --eval --resume data/thumos14/thumos14_i3d2s_tadtr_reference.pth\nelse\n    echo \"Unsupported dataset ${dataset}. Exit\"\nfi\n"
  },
  {
    "path": "util/__init__.py",
    "content": "# ------------------------------------------------------------------------\n# TadTR: End-to-end Temporal Action Detection with Transformer\n# Copyright (c) 2021. Xiaolong Liu.\n# ------------------------------------------------------------------------"
  },
  {
    "path": "util/logger.py",
    "content": "# ------------------------------------------------------------------------\n# TadTR: End-to-end Temporal Action Detection with Transformer\n# Copyright (c) 2021. Xiaolong Liu.\n# ------------------------------------------------------------------------\n\n\nimport builtins\nimport logging\nimport sys\nfrom .misc import is_main_process\n\n\ndef _suppress_print():\n    \"\"\"\n    Suppresses printing from the current process.\n    \"\"\"\n\n    def print_pass(*objects, sep=\" \", end=\"\\n\", file=sys.stdout, flush=False):\n        pass\n\n    builtins.print = print_pass\n\n\ndef setup_logger(log_file_path, name=None, level=logging.INFO):\n    \"\"\"\n    Setup a logger that simultaneously output to a file and stdout\n    ARGS\n      log_file_path: string, path to the logging file\n    \"\"\"\n    if is_main_process():\n        print('this is master process, set up logger')\n        # logging settings\n        #   log_formatter = logging.Formatter(\"%(asctime)s [%(levelname)-5.5s]  %(message)s\")\n        log_formatter = logging.Formatter(\n            \"[%(asctime)s][%(levelname)s] %(pathname)s: %(lineno)4d: %(message)s\",\n            datefmt=\"%m/%d %H:%M:%S\")\n        root_logger = logging.getLogger(name)\n        if name:\n            root_logger.propagate = False\n        root_logger.setLevel(level)\n        # file handler\n        if log_file_path is not None:\n            log_file_handler = logging.FileHandler(log_file_path)\n            log_file_handler.setFormatter(log_formatter)\n           \n            root_logger.addHandler(log_file_handler)\n\n        # stdout handler\n        log_formatter = logging.Formatter(\n            \"[%(asctime)s][%(levelname)s]: %(message)s\",\n            datefmt=\"%m/%d %H:%M:%S\")\n        log_stream_handler = logging.StreamHandler(sys.stdout)\n        log_stream_handler.setFormatter(log_formatter)\n        root_logger.addHandler(log_stream_handler)\n\n        logging.info('Log file is %s' % 
log_file_path)\n        return root_logger\n\n    else:\n        print('this is not a master process, suppress print')\n        _suppress_print()\n"
  },
  {
    "path": "util/misc.py",
    "content": "# ------------------------------------------------------------------------\n# TadTR: End-to-end Temporal Action Detection with Transformer\n# Copyright (c) 2021. Xiaolong Liu.\n# ------------------------------------------------------------------------\n# Modified from DETR (https://github.com/facebookresearch/detr)\n# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n# ------------------------------------------------------------------------\n\n\"\"\"\nMisc functions, including distributed helpers.\n\nMostly copy-paste from torchvision references.\n\"\"\"\nimport os\nimport subprocess\nimport time\nfrom collections import defaultdict, deque\nimport datetime\nimport pickle\nfrom typing import Optional, List\n\nimport torch\nimport torch.distributed as dist\nfrom torch import Tensor\nimport logging\n\n# needed due to empty tensor bug in pytorch and torchvision 0.5\nimport torchvision\n\n\ndef mkdir_if_not_exist(dirname):\n    if not os.path.exists(dirname):\n        os.makedirs(dirname)\n\n\nclass SmoothedValue(object):\n    \"\"\"Track a series of values and provide access to smoothed values over a\n    window or the global series average.\n    \"\"\"\n\n    def __init__(self, window_size=20, fmt=None):\n        if fmt is None:\n            fmt = \"{median:.4f} ({global_avg:.4f})\"\n        self.deque = deque(maxlen=window_size)\n        self.total = 0.0\n        self.count = 0\n        self.fmt = fmt\n\n    def update(self, value, n=1):\n        self.deque.append(value)\n        self.count += n\n        self.total += value * n\n\n    def synchronize_between_processes(self):\n        \"\"\"\n        Warning: does not synchronize the deque!\n        \"\"\"\n        if not is_dist_avail_and_initialized():\n            return\n        t = torch.tensor([self.count, self.total],\n                         dtype=torch.float64, device='cuda')\n        dist.barrier()\n        dist.all_reduce(t)\n        t = t.tolist()\n        self.count = 
int(t[0])\n        self.total = t[1]\n\n    @property\n    def median(self):\n        d = torch.tensor(list(self.deque))\n        return d.median().item()\n\n    @property\n    def avg(self):\n        d = torch.tensor(list(self.deque), dtype=torch.float32)\n        return d.mean().item()\n\n    @property\n    def global_avg(self):\n        return self.total / self.count\n\n    @property\n    def max(self):\n        return max(self.deque)\n\n    @property\n    def value(self):\n        return self.deque[-1]\n\n    def __str__(self):\n        return self.fmt.format(\n            median=self.median,\n            avg=self.avg,\n            global_avg=self.global_avg,\n            max=self.max,\n            value=self.value)\n\n\ndef all_gather(data):\n    \"\"\"\n    Run all_gather on arbitrary picklable data (not necessarily tensors)\n    Args:\n        data: any picklable object\n    Returns:\n        list[data]: list of data gathered from each rank\n    \"\"\"\n    world_size = get_world_size()\n    if world_size == 1:\n        return [data]\n\n    # serialized to a Tensor\n    buffer = pickle.dumps(data)\n    storage = torch.ByteStorage.from_buffer(buffer)\n    tensor = torch.ByteTensor(storage).to(\"cuda\")\n\n    # obtain Tensor size of each rank\n    local_size = torch.tensor([tensor.numel()], device=\"cuda\")\n    size_list = [torch.tensor([0], device=\"cuda\") for _ in range(world_size)]\n    dist.all_gather(size_list, local_size)\n    size_list = [int(size.item()) for size in size_list]\n    max_size = max(size_list)\n\n    # receiving Tensor from all ranks\n    # we pad the tensor because torch all_gather does not support\n    # gathering tensors of different shapes\n    tensor_list = []\n    for _ in size_list:\n        tensor_list.append(torch.empty(\n            (max_size,), dtype=torch.uint8, device=\"cuda\"))\n    if local_size != max_size:\n        padding = torch.empty(size=(max_size - local_size,),\n                              dtype=torch.uint8, 
device=\"cuda\")\n        tensor = torch.cat((tensor, padding), dim=0)\n    dist.all_gather(tensor_list, tensor)\n\n    data_list = []\n    for size, tensor in zip(size_list, tensor_list):\n        buffer = tensor.cpu().numpy().tobytes()[:size]\n        data_list.append(pickle.loads(buffer))\n\n    return data_list\n\n\ndef reduce_dict(input_dict, average=True):\n    \"\"\"\n    Args:\n        input_dict (dict): all the values will be reduced\n        average (bool): whether to do average or sum\n    Reduce the values in the dictionary from all processes so that all processes\n    have the averaged results. Returns a dict with the same fields as\n    input_dict, after reduction.\n    \"\"\"\n    world_size = get_world_size()\n    if world_size < 2:\n        return input_dict\n    with torch.no_grad():\n        names = []\n        values = []\n        # sort the keys so that they are consistent across processes\n        for k in sorted(input_dict.keys()):\n            names.append(k)\n            values.append(input_dict[k])\n        values = torch.stack(values, dim=0)\n        dist.all_reduce(values)\n        if average:\n            values /= world_size\n        reduced_dict = {k: v for k, v in zip(names, values)}\n    return reduced_dict\n\n\nclass MetricLogger(object):\n    def __init__(self, delimiter=\"\\t\"):\n        self.meters = defaultdict(SmoothedValue)\n        self.delimiter = delimiter\n\n    def update(self, **kwargs):\n        for k, v in kwargs.items():\n            if isinstance(v, torch.Tensor):\n                v = v.item()\n            assert isinstance(v, (float, int))\n            self.meters[k].update(v)\n\n    def __getattr__(self, attr):\n        if attr in self.meters:\n            return self.meters[attr]\n        if attr in self.__dict__:\n            return self.__dict__[attr]\n        raise AttributeError(\"'{}' object has no attribute '{}'\".format(\n            type(self).__name__, attr))\n\n    def __str__(self):\n        loss_str 
= []\n        for name, meter in self.meters.items():\n            loss_str.append(\n                \"{}: {}\".format(name, str(meter))\n            )\n        return self.delimiter.join(loss_str)\n\n    def synchronize_between_processes(self):\n        for meter in self.meters.values():\n            meter.synchronize_between_processes()\n\n    def add_meter(self, name, meter):\n        self.meters[name] = meter\n\n    def log_every(self, iterable, print_freq, header=None):\n        i = 0\n        if not header:\n            header = ''\n        start_time = time.time()\n        end = time.time()\n        iter_time = SmoothedValue(fmt='{avg:.4f}')\n        data_time = SmoothedValue(fmt='{avg:.4f}')\n        space_fmt = ':' + str(len(str(len(iterable)))) + 'd'\n        if torch.cuda.is_available():\n            log_msg = self.delimiter.join([\n                header,\n                '[{0' + space_fmt + '}/{1}]',\n                'eta: {eta}',\n                '{meters}',\n                'time: {time}',\n                'data: {data}',\n                'max mem: {memory:.0f}'\n            ])\n        else:\n            log_msg = self.delimiter.join([\n                header,\n                '[{0' + space_fmt + '}/{1}]',\n                'eta: {eta}',\n                '{meters}',\n                'time: {time}',\n                'data: {data}'\n            ])\n        MB = 1024.0 * 1024.0\n        for obj in iterable:\n            data_time.update(time.time() - end)\n            yield obj\n            iter_time.update(time.time() - end)\n            if i % print_freq == 0 or i == len(iterable) - 1:\n                eta_seconds = iter_time.global_avg * (len(iterable) - i)\n                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))\n                if torch.cuda.is_available():\n                    logging.info(log_msg.format(\n                        i, len(iterable), eta=eta_string,\n                        meters=str(self),\n                   
     time=str(iter_time), data=str(data_time),\n                        memory=torch.cuda.max_memory_allocated() / MB))\n                else:\n                    logging.info(log_msg.format(\n                        i, len(iterable), eta=eta_string,\n                        meters=str(self),\n                        time=str(iter_time), data=str(data_time)))\n            i += 1\n            end = time.time()\n        total_time = time.time() - start_time\n        total_time_str = str(datetime.timedelta(seconds=int(total_time)))\n        logging.info('{} Total time: {} ({:.4f} s / it)'.format(\n            header, total_time_str, total_time / len(iterable)))\n\n\ndef get_sha():\n    cwd = os.path.dirname(os.path.abspath(__file__))\n\n    def _run(command):\n        return subprocess.check_output(command, cwd=cwd).decode('ascii').strip()\n    sha = 'N/A'\n    diff = \"clean\"\n    branch = 'N/A'\n    try:\n        sha = _run(['git', 'rev-parse', 'HEAD'])\n        subprocess.check_output(['git', 'diff'], cwd=cwd)\n        diff = _run(['git', 'diff-index', 'HEAD'])\n        diff = \"has uncommited changes\" if diff else \"clean\"\n        branch = _run(['git', 'rev-parse', '--abbrev-ref', 'HEAD'])\n    except Exception:\n        pass\n    message = f\"sha: {sha}, status: {diff}, branch: {branch}\"\n    return message\n\n\ndef collate_fn(batch):\n    batch = list(zip(*batch))\n    batch[0] = nested_tensor_from_tensor_list(batch[0])\n    # print('collate_fn done')\n\n    return tuple(batch)\n\n\ndef _max_by_axis(the_list):\n    # type: (List[List[int]]) -> List[int]\n    maxes = the_list[0]\n    for sublist in the_list[1:]:\n        for index, item in enumerate(sublist):\n            maxes[index] = max(maxes[index], item)\n    return maxes\n\n\nclass NestedTensor(object):\n    def __init__(self, tensors, mask: Optional[Tensor]):\n        self.tensors = tensors\n        self.mask = mask\n\n    def to(self, device):\n        # type: (Device) -> NestedTensor # noqa\n      
  cast_tensor = self.tensors.to(device)\n        mask = self.mask\n        if mask is not None:\n            assert mask is not None\n            cast_mask = mask.to(device)\n        else:\n            cast_mask = None\n        return NestedTensor(cast_tensor, cast_mask)\n\n    # def cuda(self):\n    #     tensors = self.tensors.cuda()\n    #     mask = self.mask.cuda()\n    #     return NestedTensor(tensors, mask)\n\n    def decompose(self):\n        return self.tensors, self.mask\n\n    def __repr__(self):\n        return str(self.tensors)\n\n\ndef nested_tensor_from_tensor_list(tensor_list: List[Tensor]):\n    # TODO make this more general\n    if tensor_list[0].ndim == 3:  # n,c,t\n        if torchvision._is_tracing():\n            # nested_tensor_from_tensor_list() does not export well to ONNX\n            # call _onnx_nested_tensor_from_tensor_list() instead\n            return _onnx_nested_tensor_from_tensor_list(tensor_list)\n\n        # TODO make it support different-sized images\n        max_size = _max_by_axis([list(img.shape) for img in tensor_list])\n        # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list]))\n        batch_shape = [len(tensor_list)] + max_size\n        b, c, h, w = batch_shape\n        dtype = tensor_list[0].dtype\n        device = tensor_list[0].device\n        tensor = torch.zeros(batch_shape, dtype=dtype, device=device)\n        mask = torch.ones((b, h, w), dtype=torch.bool, device=device)\n        for img, pad_img, m in zip(tensor_list, tensor, mask):\n            pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)\n            m[: img.shape[1], :img.shape[2]] = False\n    elif tensor_list[0].ndim == 2 or tensor_list[0].ndim == 4:\n        max_size = max([video_ft.shape[1]\n                       for video_ft in tensor_list])  # [c,t,h,w] or [c,t]\n        # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list]))\n        if tensor_list[0].ndim == 2:\n            
batch_shape = [len(tensor_list), tensor_list[0].shape[0], max_size]\n        else:\n            batch_shape = [len(tensor_list), tensor_list[0].shape[0],\n                           max_size, tensor_list[0].shape[2], tensor_list[0].shape[3]]\n        b, c, t = batch_shape[:3]\n        dtype = tensor_list[0].dtype\n        device = tensor_list[0].device\n        tensor = torch.zeros(batch_shape, dtype=dtype, device=device)\n        mask = torch.ones((b, t), dtype=torch.bool, device=device)\n        for video_ft, pad_video_ft, m in zip(tensor_list, tensor, mask):\n            pad_video_ft[: video_ft.shape[0],\n                         : video_ft.shape[1]].copy_(video_ft)\n            m[: video_ft.shape[1]] = False\n\n    else:\n        raise ValueError('not supported')\n    return NestedTensor(tensor, mask)\n\n\ndef make_nested_tensor(tensor):\n    b, t = tensor.shape[0], tensor.shape[2]\n    mask = torch.zeros([b, t], dtype=torch.bool, device=tensor.device)\n    return NestedTensor(tensor, mask)\n\n\n# _onnx_nested_tensor_from_tensor_list() is an implementation of\n# nested_tensor_from_tensor_list() that is supported by ONNX tracing.\n@torch.jit.unused\ndef _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor:\n    max_size = []\n    for i in range(tensor_list[0].dim()):\n        max_size_i = torch.max(torch.stack(\n            [img.shape[i] for img in tensor_list]).to(torch.float32)).to(torch.int64)\n        max_size.append(max_size_i)\n    max_size = tuple(max_size)\n\n    # work around for\n    # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)\n    # m[: img.shape[1], :img.shape[2]] = False\n    # which is not yet supported in onnx\n    padded_imgs = []\n    padded_masks = []\n    for img in tensor_list:\n        padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]\n        padded_img = torch.nn.functional.pad(\n            img, (0, padding[2], 0, padding[1], 0, padding[0]))\n        
padded_imgs.append(padded_img)\n\n        m = torch.zeros_like(img[0], dtype=torch.int, device=img.device)\n        padded_mask = torch.nn.functional.pad(\n            m, (0, padding[2], 0, padding[1]), \"constant\", 1)\n        padded_masks.append(padded_mask.to(torch.bool))\n\n    tensor = torch.stack(padded_imgs)\n    mask = torch.stack(padded_masks)\n\n    return NestedTensor(tensor, mask=mask)\n\n\ndef setup_for_distributed(is_master):\n    \"\"\"\n    This function disables printing when not in master process\n    \"\"\"\n    import builtins as __builtin__\n    builtin_print = __builtin__.print\n\n    def print(*args, **kwargs):\n        force = kwargs.pop('force', False)\n        if is_master or force:\n            builtin_print(*args, **kwargs)\n\n    __builtin__.print = print\n\n\ndef is_dist_avail_and_initialized():\n    if not dist.is_available():\n        return False\n    if not dist.is_initialized():\n        return False\n    return True\n\n\ndef get_world_size():\n    if not is_dist_avail_and_initialized():\n        return 1\n    return dist.get_world_size()\n\n\ndef get_rank():\n    if not is_dist_avail_and_initialized():\n        return 0\n    return dist.get_rank()\n\n\ndef is_main_process():\n    return get_rank() == 0\n\n\ndef save_on_master(*args, **kwargs):\n    if is_main_process():\n        torch.save(*args, **kwargs)\n\n\ndef init_distributed_mode(args):\n    if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:\n        args.rank = int(os.environ[\"RANK\"])\n        args.world_size = int(os.environ['WORLD_SIZE'])\n        args.gpu = int(os.environ['LOCAL_RANK'])\n    elif 'SLURM_PROCID' in os.environ:\n        args.rank = int(os.environ['SLURM_PROCID'])\n        args.gpu = args.rank % torch.cuda.device_count()\n    else:\n        print('Not using distributed mode')\n        args.distributed = False\n        return\n\n    args.distributed = True\n\n    torch.cuda.set_device(args.gpu)\n    args.dist_backend = 'nccl'\n    print('| 
distributed init (rank {}): {}'.format(\n        args.rank, args.dist_url), flush=True)\n    torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,\n                                         world_size=args.world_size, rank=args.rank)\n    torch.distributed.barrier()\n    setup_for_distributed(args.rank == 0)\n\n\n@torch.no_grad()\ndef accuracy(output, target, topk=(1,)):\n    \"\"\"Computes the precision@k for the specified values of k\"\"\"\n    if target.numel() == 0:\n        return [torch.zeros([], device=output.device)]\n    maxk = max(topk)\n    batch_size = target.size(0)\n\n    _, pred = output.topk(maxk, 1, True, True)\n    pred = pred.t()\n    correct = pred.eq(target.view(1, -1).expand_as(pred))\n\n    res = []\n    for k in topk:\n        correct_k = correct[:k].view(-1).float().sum(0)\n        res.append(correct_k.mul_(100.0 / batch_size))\n    return res\n\n\ndef inverse_sigmoid(x, eps=1e-5):\n    x = x.clamp(min=0, max=1)\n    x1 = x.clamp(min=eps)\n    x2 = (1 - x).clamp(min=eps)\n    return torch.log(x1/x2)\n"
  },
  {
    "path": "util/segment_ops.py",
    "content": "# ------------------------------------------------------------------------\n# TadTR: End-to-end Temporal Action Detection with Transformer\n# Copyright (c) 2021. Xiaolong Liu.\n# ------------------------------------------------------------------------\n# Modified from DETR (https://github.com/facebookresearch/detr)\n# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved\n# ------------------------------------------------------------------------\n\n\"\"\"\nUtilities for segment manipulation and IoU.\n\"\"\"\nimport torch\nimport numpy as np\n# from torchvision.ops.boxes import box_area\n\n\ndef segment_cw_to_t1t2(x):\n    '''corresponds to box_cxcywh_to_xyxy in detr\n    Params:\n        x: segments in (center, width) format, shape=(*, 2)\n    Returns:\n        segments in (t_start, t_end) format, shape=(*, 2)\n    '''\n    if not isinstance(x, np.ndarray):\n        x_c, w = x.unbind(-1)\n        b = [(x_c - 0.5 * w), (x_c + 0.5 * w)]\n        return torch.stack(b, dim=-1)\n    else:\n        x_c, w = x[..., 0], x[..., 1]\n        b = [(x_c - 0.5 * w)[..., None], (x_c + 0.5 * w)[..., None]]\n        return np.concatenate(b, axis=-1)\n\n\ndef segment_t1t2_to_cw(x):\n    '''corresponds to box_xyxy_to_cxcywh in detr\n    Params:\n        x: segments in (t_start, t_end) format, shape=(*, 2)\n    Returns:\n        segments in (center, width) format, shape=(*, 2)\n    '''\n    if not isinstance(x, np.ndarray):\n        x1, x2 = x.unbind(-1)\n        b = [(x1 + x2) / 2, (x2 - x1)]\n        return torch.stack(b, dim=-1)\n    else:\n        x1, x2 = x[..., 0], x[..., 1]\n        b = [((x1 + x2) / 2)[..., None], (x2 - x1)[..., None]]\n        return np.concatenate(b, axis=-1)\n\n\ndef segment_length(segments):\n    return (segments[:, 1]-segments[:, 0]).clamp(min=0)\n\n\n# modified from torchvision to also return the union\ndef segment_iou_and_union(segments1, segments2):\n    area1 = segment_length(segments1)\n    area2 = 
segment_length(segments2)\n\n    l = torch.max(segments1[:, None, 0], segments2[:, 0])  # N,M\n    r = torch.min(segments1[:, None, 1], segments2[:, 1])  # N,M\n    inter = (r - l).clamp(min=0)  # [N,M]\n\n    union = area1[:, None] + area2 - inter\n\n    iou = inter / union\n    return iou, union\n\n\ndef segment_iou(segments1, segments2):\n    \"\"\"\n    Temporal IoU between two sets of segments.\n\n    The segments should be in (t_start, t_end) format\n\n    Returns a [N, M] pairwise matrix, where N = len(segments1)\n    and M = len(segments2)\n    \"\"\"\n    # degenerate segments give inf / nan results\n    # so do an early check\n    assert (segments1[:, 1] >= segments1[:, 0]).all()\n\n    area1 = segment_length(segments1)\n    area2 = segment_length(segments2)\n\n    l = torch.max(segments1[:, None, 0], segments2[:, 0])  # N,M\n    r = torch.min(segments1[:, None, 1], segments2[:, 1])  # N,M\n    inter = (r - l).clamp(min=0)  # [N,M]\n\n    union = area1[:, None] + area2 - inter\n\n    iou = inter / union\n\n    return iou\n\n\ndef temporal_iou_numpy(proposal_min, proposal_max, gt_min, gt_max):\n    \"\"\"Compute IoU score between a groundtruth instance and the proposals.\n\n    Args:\n        proposal_min (list[float]): List of temporal anchor min.\n        proposal_max (list[float]): List of temporal anchor max.\n        gt_min (float): Groundtruth temporal box min.\n        gt_max (float): Groundtruth temporal box max.\n\n    Returns:\n        list[float]: List of iou scores.\n    \"\"\"\n    len_anchors = proposal_max - proposal_min\n    int_tmin = np.maximum(proposal_min, gt_min)\n    int_tmax = np.minimum(proposal_max, gt_max)\n    inter_len = np.maximum(int_tmax - int_tmin, 0.)\n    union_len = len_anchors - inter_len + gt_max - gt_min\n    jaccard = np.divide(inter_len, union_len)\n    return jaccard\n\n\ndef temporal_iou_numpy(proposal_min, proposal_max, gt_min, gt_max):\n    \"\"\"Compute IoP score between a groundtruth bbox and the proposals.\n\n    Compute the IoP which 
is defined as the overlap ratio with\n    groundtruth proportional to the duration of this proposal.\n\n    Args:\n        proposal_min (list[float]): List of temporal anchor min.\n        proposal_max (list[float]): List of temporal anchor max.\n        gt_min (float): Groundtruth temporal box min.\n        gt_max (float): Groundtruth temporal box max.\n\n    Returns:\n        list[float]: List of intersection over anchor scores.\n    \"\"\"\n    len_anchors = np.array(proposal_max - proposal_min)\n    int_tmin = np.maximum(proposal_min, gt_min)\n    int_tmax = np.minimum(proposal_max, gt_max)\n    inter_len = np.maximum(int_tmax - int_tmin, 0.)\n    scores = np.divide(inter_len, len_anchors)\n    return scores\n\n\ndef soft_nms(proposals, alpha, low_threshold, high_threshold, top_k):\n    \"\"\"Soft NMS for temporal proposals.\n\n    Args:\n        proposals (np.ndarray): Proposals generated by network.\n        alpha (float): Alpha value of Gaussian decaying function.\n        low_threshold (float): Low threshold for soft nms.\n        high_threshold (float): High threshold for soft nms.\n        top_k (int): Top k values to be considered.\n\n    Returns:\n        np.ndarray: The updated proposals.\n    \"\"\"\n    proposals = proposals[proposals[:, -1].argsort()[::-1]]\n    tstart = list(proposals[:, 0])\n    tend = list(proposals[:, 1])\n    tscore = list(proposals[:, 2])\n    rstart = []\n    rend = []\n    rscore = []\n\n    while len(tscore) > 0 and len(rscore) <= top_k:\n        max_index = np.argmax(tscore)\n        max_width = tend[max_index] - tstart[max_index]\n        iou_list = temporal_iou_numpy(tstart[max_index], tend[max_index],\n                                      np.array(tstart), np.array(tend))\n        iou_exp_list = np.exp(-np.square(iou_list) / alpha)\n\n        for idx, _ in enumerate(tscore):\n            if idx != max_index:\n                current_iou = iou_list[idx]\n                if current_iou > low_threshold + (high_threshold 
-\n                                                  low_threshold) * max_width:\n                    tscore[idx] = tscore[idx] * iou_exp_list[idx]\n\n        rstart.append(tstart[max_index])\n        rend.append(tend[max_index])\n        rscore.append(tscore[max_index])\n        tstart.pop(max_index)\n        tend.pop(max_index)\n        tscore.pop(max_index)\n\n    rstart = np.array(rstart).reshape(-1, 1)\n    rend = np.array(rend).reshape(-1, 1)\n    rscore = np.array(rscore).reshape(-1, 1)\n    new_proposals = np.concatenate((rstart, rend, rscore), axis=1)\n    return new_proposals\n\n\ndef temporal_nms(segments, thresh):\n    \"\"\"\n    One-dimensional non-maximal suppression\n    :param segments: [[st, ed, score, ...], ...]\n    :param thresh:\n    :return:\n    \"\"\"\n    t1 = segments[:, 0]\n    t2 = segments[:, 1]\n    scores = segments[:, 2]\n\n    durations = t2 - t1\n    order = scores.argsort()[::-1]\n\n    keep = []\n    while order.size > 0:\n        i = order[0]\n        keep.append(i)\n        tt1 = np.maximum(t1[i], t1[order[1:]])\n        tt2 = np.minimum(t2[i], t2[order[1:]])\n        intersection = tt2 - tt1\n        IoU = intersection / \\\n            (durations[i] + durations[order[1:]] - intersection).astype(float)\n\n        inds = np.where(IoU <= thresh)[0]\n        order = order[inds + 1]\n\n    return segments[keep, :]\n"
  }
]