Repository: xlliu7/TadTR
Branch: master
Commit: 983ae14bcec8
Files: 37
Total size: 197.1 KB

Directory structure:
gitextract_inim4nmp/

├── .gitignore
├── Evaluation/
│   ├── README.md
│   ├── eval_detection.py
│   └── utils.py
├── LICENSE
├── README.md
├── configs/
│   └── thumos14_i3d2s_tadtr.yml
├── datasets/
│   ├── __init__.py
│   ├── data_utils.py
│   ├── path.yml
│   ├── tad_dataset.py
│   └── tad_eval.py
├── demo.py
├── docs/
│   └── 1_train_on_your_dataset.md
├── engine.py
├── main.py
├── models/
│   ├── __init__.py
│   ├── custom_loss.py
│   ├── matcher.py
│   ├── ops/
│   │   ├── roi_align/
│   │   │   ├── __init__.py
│   │   │   ├── roi_align.py
│   │   │   └── src/
│   │   │       ├── roi_align_cuda.cpp
│   │   │       └── roi_align_kernel.cu
│   │   ├── setup.py
│   │   └── temporal_deform_attn/
│   │       ├── __init__.py
│   │       └── temporal_deform_attn.py
│   ├── position_encoding.py
│   ├── tadtr.py
│   └── transformer.py
├── opts.py
├── requirements.txt
├── scripts/
│   ├── run_parallel.sh
│   └── test_reference_models.sh
└── util/
    ├── __init__.py
    ├── logger.py
    ├── misc.py
    └── segment_ops.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
# User defined
data/
outputs/


# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/


================================================
FILE: Evaluation/README.md
================================================
# ActivityNet Large Scale Activity Recognition Challenge - Evaluation Toolkit
This is the documentation of the ActivityNet Large Scale Activity Recognition
Challenge Evaluation Toolkit. It includes APIs to evaluate the performance of a method in the two different tasks in the challenge: *untrimmed video classification* and *activity detection*. For more information about the challenge competitions, please read the [guidelines](http://activity-net.org/challenges/2016/guidelines.html).

## Dependencies
The Evaluation Toolkit is purely written in Python (>=2.7) and it requires the 
following third party libraries:
* [Numpy](http://www.numpy.org/)
* [Pandas](http://pandas.pydata.org/)

## Getting started
We include sample prediction files in the folder data to show how to evaluate your prediction results. Please follow these steps to obtain the performance evaluation on the provided sample files:
* Run `git clone` this repository.
* To evaluate classification performance call: `python get_classification_performance.py data/activity_net.v1-3.min.json sample_classification_prediction.json`
* To evaluate detection performance call: `python get_detection_performance.py data/activity_net.v1-3.min.json sample_detection_prediction.json`

## Contributions and Troubleshooting
We welcome contributions; please keep your pull request simple so we can get back to you as soon as we can. If you find a bug, please open a new issue and describe the problem.


================================================
FILE: Evaluation/eval_detection.py
================================================
import json
import sys

import urllib.error, urllib.parse

import numpy as np
import pandas as pd

from .utils import get_blocked_videos
from .utils import interpolated_prec_rec
from .utils import segment_iou
import pdb
import traceback
import logging


from joblib import Parallel, delayed


logger_initilized = False


def setup_logger(log_file_path, name=None, level=logging.INFO):
    """
    Setup a logger that simultaneously outputs to a file and stdout.
    ARGS
        log_file_path: string or None, path to the logging file. When None,
            no file handler is attached and only stdout is used.
        name: name of the logger to configure. None (default) configures the
            root logger; a named logger has propagation disabled so records
            are not duplicated by the root logger's handlers.
        level: logging level applied to the configured logger.
    RETURNS
        The configured logging.Logger instance.
    """
    global logger_initilized

    # The file handler uses a verbose format that includes the source location.
    file_formatter = logging.Formatter(
            "[%(asctime)s][%(levelname)s] %(pathname)s: %(lineno)4d: %(message)s",
            datefmt="%m/%d %H:%M:%S")
    root_logger = logging.getLogger(name)

    if name:
        root_logger.propagate = False
    root_logger.setLevel(level)
    # file handler
    if log_file_path is not None:
        log_file_handler = logging.FileHandler(log_file_path)
        log_file_handler.setFormatter(file_formatter)
        root_logger.addHandler(log_file_handler)

    # The stdout handler uses a shorter format without the source location.
    stream_formatter = logging.Formatter(
            "[%(asctime)s][%(levelname)s]: %(message)s",
            datefmt="%m/%d %H:%M:%S")
    log_stream_handler = logging.StreamHandler(sys.stdout)
    log_stream_handler.setFormatter(stream_formatter)
    root_logger.addHandler(log_stream_handler)

    # Report through the logger that was just configured. The module-level
    # logging.info() always targets the root logger, so with a named logger
    # the message would bypass the handlers installed above.
    root_logger.info('Log file is %s', log_file_path)
    logger_initilized = True
    return root_logger


def get_classes(anno_dict):
    """Return the list of class names for an annotation dict.

    If the dict carries an explicit 'classes' entry it is returned as is.
    Otherwise the labels of all annotations of all videos in 'database'
    are collected, de-duplicated and returned in sorted order.
    """
    if 'classes' not in anno_dict:
        label_set = set()
        for video_info in anno_dict['database'].values():
            for annotation in video_info['annotations']:
                label_set.add(annotation['label'])
        return sorted(label_set)
    return anno_dict['classes']


class ANETdetection(object):
    """ActivityNet-style evaluator for temporal action detection.

    Ground truth and predictions are loaded (from a JSON file path or from an
    already-parsed dict) into pandas data frames. `evaluate()` then computes
    the interpolated average precision per class and per tIoU threshold and
    stores the results in `self.ap` and `self.mAP`.
    """

    GROUND_TRUTH_FIELDS = ['database', 'taxonomy', 'version']
    PREDICTION_FIELDS = ['results', 'version', 'external_data']

    def __init__(self, ground_truth_filename=None, prediction_filename=None,
                 ground_truth_fields=GROUND_TRUTH_FIELDS,
                 prediction_fields=PREDICTION_FIELDS,
                 tiou_thresholds=np.linspace(0.5, 0.95, 10),
                 subset='validation', verbose=False,
                 check_status=False, log_path=None, exclude_videos=None):
        """Load ground truth and predictions.

        Parameters
        ----------
        ground_truth_filename : str or dict
            Path to the ground truth json file, or the parsed content.
        prediction_filename : str or dict
            Path to the prediction json file, or the parsed content.
        ground_truth_fields : list
            Stored as `self.gt_fields`; the ground truth format check that
            used it is currently disabled.
        prediction_fields : list
            Keys that must be present in the prediction data.
        tiou_thresholds : 1darray
            Temporal IoU thresholds at which AP is computed.
        subset : str
            Only ground truth videos whose 'subset' equals this are used.
        verbose : bool
            Log loading statistics and final results.
        check_status : bool
            Stored as `self.check_status`; not used in this class.
        log_path : str, optional
            Log file path, used only if the logger was not set up before.
        exclude_videos : list, optional
            Video ids skipped in both ground truth and predictions.

        Raises
        ------
        IOError
            If the ground truth or the prediction argument is missing/empty,
            or the prediction data lacks one of `prediction_fields`.
        """
        if not ground_truth_filename:
            raise IOError('Please input a valid ground truth file.')
        if not prediction_filename:
            raise IOError('Please input a valid prediction file.')
        self.subset = subset
        # Configure logging only once per process; afterwards reuse the
        # root logger.
        if not logger_initilized:
            print('setup logger')
            logger = setup_logger(log_path)
        else:
            logger = logging.getLogger()
        self.logger = logger

        self.tiou_thresholds = tiou_thresholds
        self.verbose = verbose
        self.gt_fields = ground_truth_fields
        self.pred_fields = prediction_fields
        self.ap = None
        self.check_status = check_status

        self.blocked_videos = exclude_videos if exclude_videos else list()
        # Import ground truth and predictions.
        self.ground_truth, self.activity_index = self._import_ground_truth(
            ground_truth_filename)
        self.prediction = self._import_prediction(prediction_filename)

        if self.verbose:
            self.logger.info('[INIT] Loaded annotations from {} subset.'.format(subset))
            nr_gt = len(self.ground_truth)
            self.logger.info('\tNumber of ground truth instances: {}'.format(nr_gt))
            nr_pred = len(self.prediction)
            self.logger.info('\tNumber of predictions: {}'.format(nr_pred))
            self.logger.info('\tFixed threshold for tiou score: {}'.format(self.tiou_thresholds))

    def _import_ground_truth(self, ground_truth_filename):
        """Reads ground truth data and returns the ground truth instances
           and the activity classes.

        Parameters
        ----------
        ground_truth_filename : str or dict
            Full path to the ground truth json file, or the parsed content.

        Outputs
        -------
        ground_truth : df
            Data frame containing the ground truth instances.
        activity_index : dict
            Dictionary mapping class name to class index.
        """
        if isinstance(ground_truth_filename, str):
            with open(ground_truth_filename, 'r') as fobj:
                data = json.load(fobj)
        else:
            data = ground_truth_filename

        # Class indices follow the order returned by get_classes().
        class_list = get_classes(data)
        activity_index = {cls_name: idx for idx, cls_name in enumerate(class_list)}
        video_lst, t_start_lst, t_end_lst, label_lst, difficult_lst = [], [], [], [], []
        for videoid, v in data['database'].items():
            if self.subset != v['subset']:
                continue
            if videoid in self.blocked_videos:
                continue
            for ann in v['annotations']:
                video_lst.append(videoid)
                t_start_lst.append(float(ann['segment'][0]))
                t_end_lst.append(float(ann['segment'][1]))
                label_lst.append(activity_index[ann['label']])
                # Annotations without a 'difficult' flag count as not difficult.
                difficult_lst.append(ann.get('difficult', 0))

        ground_truth = pd.DataFrame({'video-id': video_lst,
                                     't-start': t_start_lst,
                                     't-end': t_end_lst,
                                     'label': label_lst,
                                     'difficult': difficult_lst})
        self.class_list = list(class_list)

        return ground_truth, activity_index

    def _import_prediction(self, prediction_filename):
        """Reads prediction data, checks if it is well formatted, and returns
           the prediction instances.

        Parameters
        ----------
        prediction_filename : str or dict
            Full path to the prediction json file, or the parsed content.

        Outputs
        -------
        prediction : df
            Data frame containing the prediction instances.

        Raises
        ------
        IOError
            If one of `self.pred_fields` is missing from the data.
        KeyError
            If a predicted label is not in `self.activity_index`.
        """
        if isinstance(prediction_filename, str):
            with open(prediction_filename, 'r') as fobj:
                data = json.load(fobj)
        else:
            data = prediction_filename
        # Checking format...
        if not all(field in data for field in self.pred_fields):
            raise IOError('Please input a valid prediction file.')

        # Read predictions.
        video_lst, t_start_lst, t_end_lst = [], [], []
        label_lst, score_lst = [], []
        for videoid, v in data['results'].items():
            if videoid in self.blocked_videos:
                continue
            for result in v:
                label = self.activity_index[result['label']]
                video_lst.append(videoid)
                t_start_lst.append(result['segment'][0])
                t_end_lst.append(result['segment'][1])
                label_lst.append(label)
                score_lst.append(result['score'])
        prediction = pd.DataFrame({'video-id': video_lst,
                                   't-start': t_start_lst,
                                   't-end': t_end_lst,
                                   'label': label_lst,
                                   'score': score_lst})
        return prediction

    ################################# copied from GTAD #######################################
    def _get_predictions_with_label(self, prediction_by_label, label_name, cidx):
        """Get all predictions of the given label. Return an empty DataFrame
        if there are no predictions with the given label.
        """
        try:
            return prediction_by_label.get_group(cidx).reset_index(drop=True)
        except KeyError:
            # get_group raises KeyError for a label without predictions; any
            # other error is a real problem and must not be hidden.
            if self.verbose:
                print('Warning: No predictions of label \'%s\' were provdied.' % label_name)
            return pd.DataFrame()

    def wrapper_compute_average_precision(self):
        """Computes average precision for each class in the subset.

        Returns an array of shape (number of tIoU thresholds, number of
        classes).
        """
        ap = np.zeros((len(self.tiou_thresholds), len(self.activity_index)))

        # Adaptation to query faster
        ground_truth_by_label = self.ground_truth.groupby('label')
        prediction_by_label = self.prediction.groupby('label')

        # One job per class. NOTE(review): get_group raises KeyError for a
        # class that has no ground truth instance in the selected subset.
        results = Parallel(n_jobs=len(self.activity_index))(
                    delayed(compute_average_precision_detection)(
                        ground_truth=ground_truth_by_label.get_group(cidx).reset_index(drop=True),
                        prediction=self._get_predictions_with_label(prediction_by_label, label_name, cidx),
                        tiou_thresholds=self.tiou_thresholds,
                    ) for label_name, cidx in self.activity_index.items())

        # Results come back in the iteration order of activity_index.
        for i, cidx in enumerate(self.activity_index.values()):
            ap[:,cidx] = results[i]

        return ap
    #################################################################################

    def evaluate(self):
        """Evaluates a prediction file. For the detection task we measure the
        interpolated mean average precision to measure the performance of a
        method.

        Stores the per-threshold, per-class AP in `self.ap` and the per-
        threshold mean over classes in `self.mAP`.
        """
        self.ap = self.wrapper_compute_average_precision()
        self.mAP = self.ap.mean(axis=1)
        if self.verbose:
            self.logger.info('[RESULTS] Performance on ActivityNet detection task.')
            self.logger.info('\n{}'.format(' '.join(['%.4f' % (x * 1) for x in self.mAP])))
            self.logger.info('\tAverage-mAP: {}'.format(self.mAP.mean()))

def compute_average_precision_detection(ground_truth, prediction, tiou_thresholds=np.linspace(0.5, 0.95, 10)):
    """Compute average precision (detection task) between ground truth and
    prediction data frames. If multiple predictions match the same ground
    truth segment, only the one with the highest score is counted as a
    true positive. This code is greatly inspired by Pascal VOC devkit.

    Parameters
    ----------
    ground_truth : df
        Data frame containing the ground truth instances.
        Required fields: ['video-id', 't-start', 't-end']
    prediction : df
        Data frame containing the prediction instances.
        Required fields: ['video-id', 't-start', 't-end', 'score']
        An empty data frame (with or without columns) is accepted.
    tiou_thresholds : 1darray, optional
        Temporal intersection over union thresholds.

    Outputs
    -------
    ap : 1darray
        Average precision score for each tIoU threshold.
    """
    ap = np.zeros(len(tiou_thresholds))
    # Without predictions the AP is zero at every threshold. An empty frame
    # may not even have a 'score' column, so return before indexing it.
    if prediction.empty:
        return ap

    npos = float(len(ground_truth))
    # lock_gt[t, g] holds the index of the prediction matched to ground truth
    # g at threshold t, or -1 while g is still unmatched.
    lock_gt = np.ones((len(tiou_thresholds), len(ground_truth))) * -1
    # Sort predictions by decreasing score order.
    sort_idx = prediction['score'].values.argsort()[::-1]
    prediction = prediction.loc[sort_idx].reset_index(drop=True)

    # Initialize true positive and false positive vectors.
    tp = np.zeros((len(tiou_thresholds), len(prediction)))
    fp = np.zeros((len(tiou_thresholds), len(prediction)))

    # Adaptation to query faster
    ground_truth_gbvn = ground_truth.groupby('video-id')
    # Assigning true positives to ground truth instances.
    for idx, this_pred in prediction.iterrows():

        try:
            # Check if there is at least one ground truth in the video associated.
            ground_truth_videoid = ground_truth_gbvn.get_group(this_pred['video-id'])
        except KeyError:
            # No ground truth in this video: false positive at every threshold.
            fp[:, idx] = 1
            continue

        # reset_index() keeps the position within `ground_truth` in the
        # 'index' column; it addresses the columns of lock_gt.
        this_gt = ground_truth_videoid.reset_index()
        gt_positions = this_gt['index'].values
        tiou_arr = segment_iou(this_pred[['t-start', 't-end']].values,
                               this_gt[['t-start', 't-end']].values)
        # Visit ground truth segments from highest to lowest tIoU.
        tiou_sorted_idx = tiou_arr.argsort()[::-1]
        for tidx, tiou_thr in enumerate(tiou_thresholds):
            for jdx in tiou_sorted_idx:
                if tiou_arr[jdx] < tiou_thr:
                    # All remaining candidates overlap even less.
                    fp[tidx, idx] = 1
                    break
                if lock_gt[tidx, gt_positions[jdx]] >= 0:
                    # Already claimed by a higher-scoring prediction.
                    continue
                # Assign as true positive after the filters above.
                tp[tidx, idx] = 1
                lock_gt[tidx, gt_positions[jdx]] = idx
                break

            # Every sufficiently overlapping ground truth was already taken.
            if fp[tidx, idx] == 0 and tp[tidx, idx] == 0:
                fp[tidx, idx] = 1

    for tidx in range(len(tiou_thresholds)):
        # Computing prec-rec
        # The builtin float replaces the np.float alias, which NumPy removed.
        this_tp = np.cumsum(tp[tidx,:]).astype(float)
        this_fp = np.cumsum(fp[tidx,:]).astype(float)
        rec = this_tp / npos
        prec = this_tp / (this_tp + this_fp)
        ap[tidx] = interpolated_prec_rec(prec, rec)

    return ap


================================================
FILE: Evaluation/utils.py
================================================
import json
import urllib.request, urllib.error, urllib.parse

import numpy as np

API = 'http://ec2-52-11-11-89.us-west-2.compute.amazonaws.com/challenge17/api.py'

def get_blocked_videos(api=API):
    """Query the challenge API for blocked videos.

    Sends a request to `<api>?action=get_blocked` and returns the JSON-decoded
    response body. Errors raised by urllib or the JSON decoder propagate to
    the caller.
    """
    api_url = '{}?action=get_blocked'.format(api)
    req = urllib.request.Request(api_url)
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(req) as response:
        return json.loads(response.read())

def interpolated_prec_rec(prec, rec):
    """Interpolated AP - VOCdevkit from VOC 2011.

    Pads the precision/recall curves with sentinel values, replaces each
    precision by the maximum precision to its right, and sums the resulting
    precision over the steps where recall changes.
    """
    padded_prec = np.hstack([[0], prec, [0]])
    padded_rec = np.hstack([[0], rec, [1]])
    # Sweep from right to left so each entry becomes the running maximum.
    for pos in reversed(range(len(padded_prec) - 1)):
        if padded_prec[pos + 1] > padded_prec[pos]:
            padded_prec[pos] = padded_prec[pos + 1]
    # Positions where recall differs from the previous entry.
    steps = np.flatnonzero(padded_rec[1:] != padded_rec[:-1]) + 1
    recall_delta = padded_rec[steps] - padded_rec[steps - 1]
    return np.sum(recall_delta * padded_prec[steps])

def segment_iou(target_segment, candidate_segments):
    """Temporal intersection over union of one segment against many.

    Parameters
    ----------
    target_segment : 1d array
        Temporal target segment containing [starting, ending] times.
    candidate_segments : 2d array
        Temporal candidate segments containing N x [starting, ending] times.

    Outputs
    -------
    tiou : 1d array
        Temporal intersection over union score of the N candidate segments.
    """
    target_start, target_end = target_segment[0], target_segment[1]
    cand_start = candidate_segments[:, 0]
    cand_end = candidate_segments[:, 1]

    # Overlap runs from the later start to the earlier end; disjoint
    # segments give a negative span, which is clipped to zero.
    overlap_start = np.maximum(target_start, cand_start)
    overlap_end = np.minimum(target_end, cand_end)
    segments_intersection = (overlap_end - overlap_start).clip(0)

    # Union = candidate length + target length - intersection.
    segments_union = (cand_end - cand_start) \
        + (target_end - target_start) - segments_intersection

    return segments_intersection.astype(float) / segments_union

def wrapper_segment_iou(target_segments, candidate_segments):
    """Pairwise temporal IoU between two sets of segments.

    Parameters
    ----------
    target_segments : ndarray
        2-dim array in format [m x 2:=[init, end]]
    candidate_segments : ndarray
        2-dim array in format [n x 2:=[init, end]]
    Outputs
    -------
    tiou : ndarray
        2-dim array [n x m] with IOU ratio.
    Note: the loop runs over the target segments, one segment_iou call per
    target; each call handles all candidates at once.
    """
    if not (candidate_segments.ndim == 2 and target_segments.ndim == 2):
        raise ValueError('Dimension of arguments is incorrect')

    num_candidates = candidate_segments.shape[0]
    num_targets = target_segments.shape[0]
    tiou = np.empty((num_candidates, num_targets))
    # Fill one column per target segment.
    for col, target in enumerate(target_segments):
        tiou[:, col] = segment_iou(target, candidate_segments)

    return tiou


================================================
FILE: LICENSE
================================================
Copyright (c) 2021 - 2022, Xiaolong Liu et al. All Rights Reserved.

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

-----------------------------------------------------------------------
Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)

Copyright 2020, SenseTime

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

-----------------------------------------------------------------------
DETR (https://github.com/facebookresearch/detr)

Copyright 2020 - present, Facebook, Inc

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

================================================
FILE: README.md
================================================
# TadTR: End-to-end Temporal Action Detection with Transformer

[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/end-to-end-temporal-action-detection-with/temporal-action-localization-on-thumos14)](https://paperswithcode.com/sota/temporal-action-localization-on-thumos14?p=end-to-end-temporal-action-detection-with)

By [Xiaolong Liu](https://github.com/xlliu7), [Qimeng Wang](https://scholar.google.com/citations?user=hi7AeE8AAAAJ), [Yao Hu](https://scholar.google.com/citations?user=LIu7k7wAAAAJ), [Xu Tang](https://scholar.google.com/citations?user=grP24aAAAAAJ), [Shiwei Zhang](https://scholar.google.com/citations?user=ZO3OQ-8AAAAJ), [Song Bai](http://songbai.site), [Xiang Bai](https://scholar.google.com/citations?user=UeltiQ4AAAAJ).

This repo holds the code for TadTR, described in the paper
[End-to-end temporal action detection with Transformer](https://arxiv.org/abs/2106.10271) published in IEEE Transactions on Image Processing (TIP) 2022.

<!-- _The tech report is out-dated. We have significantly improved TadTR since we uploaded it to arxiv. It achives much better performance now. We'll update the arxiv version recently._  -->

We have also explored fully end-to-end training from RGB images with TadTR. See our CVPR 2022 work [E2E-TAD][e2e-tad].


## Introduction

TadTR is an end-to-end Temporal Action Detection TRansformer. It has the following advantages over previous methods:
- Simple. It adopts a set-prediction pipeline and achieves TAD with a *single network*. It does not require a separate proposal generation stage.
- Flexible. It removes hand-crafted design such as anchor setting and NMS.
- Sparse. It produces very sparse detections (e.g. 10 on ActivityNet), thus requiring lower computation cost.
- Strong. As a *self-contained* temporal action detector, TadTR achieves state-of-the-art performance on HACS and THUMOS14. It is also much stronger than concurrent Transformer-based methods such as **RTD-Net** and **AGT**.

![](data_intro/arch.png "Architecture")

## Updates
[2023.2.19] Fix a bug in the loss calculation ([issue #21](https://github.com/xlliu7/TadTR/issues/21)). Thanks to [@zachpvin](https://github.com/zachpvin) for raising this issue!

[2022.8.7] Add support for training/testing on THUMOS14!

[2022.7.4] Glad to share that this paper will appear in IEEE Transactions on Image Processing (TIP). Although I am still busy with my thesis, I will try to make the code accessible soon. Thanks for your patience.

[2022.6] Update the technical report of this work on arxiv (now v3).

[2022.3] Our new work [E2E-TAD][e2e-tad] based on TadTR is accepted to CVPR 2022. It supports fully end-to-end training from RGB images.

[2021.9.15] Update the performance on THUMOS14.

[2021.9.1] Add demo code.

[2021.7] Our revised paper was submitted to IEEE Transactions on Image Processing.

[2021.6] Our revised paper was uploaded to arxiv.

[2021.1.21] Our paper was submitted to IJCAI 2021. 

## TODOs
- [x] add model code
- [x] add inference code
- [x] add training code
- [x] support training/inference with video input. See [E2E-TAD][e2e-tad]

## Main Results
- HACS Segments

|Method|Feature|mAP@0.5|mAP@0.75|mAP@0.95|Avg. mAP|
| :----: |:----: | :--: | :----: | :---: | :----: |
|TadTR|I3D RGB|47.14 |32.11 |10.94| 32.09|


- THUMOS14

|Method|Feature|mAP@0.3|mAP@0.4|mAP@0.5|mAP@0.6|mAP@0.7|Avg. mAP|
| :----: |:----: | :--: | :----: | :---: | :----: |:----: | :----: |
|TadTR|I3D 2stream|74.8 |69.1| 60.1| 46.6| 32.8| 56.7|

- ActivityNet-1.3

|Method|Feature|mAP@0.5|mAP@0.75|mAP@0.95|Avg. mAP|
| :----: |:----: | :--: | :----: | :---: | :----: |
|TadTR|TSN 2stream|51.29 |34.99| 9.49| 34.64|
|TadTR|TSP|53.62| 37.52| 10.56| 36.75|


## Install
### Requirements

* Linux or Windows
  
* Python>=3.7

* (Optional) CUDA>=9.2, GCC>=5.4
  
* PyTorch>=1.5.1, torchvision>=0.6.1 (following instructions [here](https://pytorch.org/))
  
* Other requirements
    ```bash
    pip install -r requirements.txt
    ```
### Compiling CUDA extensions (Optional)
The RoIAlign operator is implemented with CUDA extension.
If your machine has an NVIDIA GPU with CUDA support, you can run this step. Otherwise, please set `disable_cuda=True` in `opts.py`.
```bash
cd models/ops;

# If you have multiple installations of CUDA Toolkits, you'd better add a prefix
# CUDA_HOME=<your_cuda_toolkit_path> to specify the correct version. 
python setup.py build_ext --inplace
```

### Run a quick test
```
python demo.py
```

## 1.Data Preparation
Currently we only support `thumos14`.

### THUMOS14
Download all data from [[BaiduDrive(code: adTR)]](https://pan.baidu.com/s/183VprlbKNjMb3Gr-rfmROQ) or [[OneDrive]](https://husteducn-my.sharepoint.com/:f:/g/personal/liuxl_hust_edu_cn/EsMyXDlkrTdBsikoRQSIeUsBkxJJRsplbMyIQVYotiZRIQ?e=QYgiCH).

- Features: Download the I3D features `I3D_2stream_Pth.tar`. It was originally provided by the authors of P-GCN. I have concatenated the RGB and Flow features (drop the tail of the longer one if the lengths are inconsistent) and converted the data to float32 precision to save space.
- Annotations: The annotations of action instances and the meta information of feature files. Both are in JSON format (`th14_annotations_with_fps_duration.json` and `th14_i3d2s_ft_info.json`).
- Pre-trained Reference Models: Our pretrained model that uses I3D features `thumos14_i3d2s_tadtr_reference.pth`. This model corresponds to the config file `configs/thumos14_i3d2s_tadtr.yml`.

After downloading is finished, extract the archived feature files inplace by `cd data;tar -xf I3D_2stream_Pth.tar`. Then put the features, annotations, the model under the `data/thumos14` directory. We expect the following structure in root folder.
```
- data
  - thumos14
    - I3D_2stream_Pth
     - xxxxx
     - xxxxx
    - th14_annotations_with_fps_duration.json
    - th14_i3d2s_ft_info.json
    - thumos14_tadtr_reference.pth
```


## 2.Testing Pre-trained Models
Run
```
python main.py --cfg CFG_PATH --eval --resume CKPT_PATH
```
CFG_PATH is the path to the YAML-format config file that defines the experimental setting. For example, `configs/thumos14_i3d2s_tadtr.yml`. CKPT_PATH is the path of the pre-trained model. Alternatively, you can execute the Shell script `bash scripts/test_reference_models.sh thumos14` for simplicity.


## 3.Training by Yourself
Run the following command
```
python main.py --cfg CFG_PATH
```

This codebase supports running on both CPU and GPU. 
- To run on CPU: please add ` --device cpu` to the above command. Also, you need to set `disable_cuda=True` in `opts.py`. The CPU mode does not support actionness regression and the detection performance is lower.
- To run on GPU: since the model is very lightweight, just one GPU is enough. You may specify the GPU device ID (e.g., 0) to use by adding the prefix `CUDA_VISIBLE_DEVICES=ID ` before the above command. To run on multiple GPUs, please refer to `scripts/run_parallel.sh`.

During training, our code will automatically perform testing every N epochs (N is the `test_interval` in opts.py). Training takes 6~10 minutes on THUMOS14 if you use a modern GPU (e.g. TITAN Xp). You can also monitor the training process with Tensorboard (need to set `cfg.tensorboard=True` in `opts.py`). The tensorboard record and the checkpoint will be saved at `output_dir` (can be modified in config file).

After training is done, you can also test your trained model by running
```
python main.py --cfg CFG_PATH --eval
```
It will automatically use the best model checkpoint. If you want to manually specify the model checkpoint, run
```
python main.py --cfg CFG_PATH --eval --resume CKPT_PATH
```

Note that the performance of a model you train yourself may differ from that of the reference model, even though all seeds are fixed. The reason is that TadTR uses the `grid_sample` operator, whose gradient computation involves the non-deterministic `AtomicAdd` operator. Please refer to [ref1](https://pytorch.org/docs/stable/notes/randomness.html) [ref2](https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html#torch.use_deterministic_algorithms) [ref3(Chinese)](https://zhuanlan.zhihu.com/p/109166845) for details.

## Acknowledgement
The code is based on the [DETR](https://github.com/facebookresearch/detr) and [Deformable DETR](https://github.com/fundamentalvision/Deformable-DETR). We also borrow the implementation of the RoIAlign1D from [G-TAD](https://github.com/Frostinassiky/gtad). Thanks for their great works.

## Citing
```
@article{liu2022end,
  title={End-to-end Temporal Action Detection with Transformer},
  author={Liu, Xiaolong and Wang, Qimeng and Hu, Yao and Tang, Xu and Zhang, Shiwei and Bai, Song and Bai, Xiang},
  journal={IEEE Transactions on Image Processing (TIP)},
  year={2022}
}
```

## Contact

For questions and suggestions, please contact Xiaolong Liu by email ("liuxl at hust dot edu dot cn").

[e2e-tad]: https://github.com/xlliu7/E2E-TAD


================================================
FILE: configs/thumos14_i3d2s_tadtr.yml
================================================
# model setting
enc_layers: 4
dec_layers: 4
dim_feedforward: 1024
num_queries: 40

# data setting
dataset_name: thumos14
feature: i3d2s
feature_dim: 2048
online_slice: true
slice_len: 128
slice_overlap: 0.75
test_slice_overlap: 0.25

# output
output_dir: outputs/thumos14_i3d2s_tadtr


================================================
FILE: datasets/__init__.py
================================================
from .tad_dataset import build as build_video_dataset


def build_dataset(subset, args, mode):
    '''Build the dataset named by ``args.dataset_name``.

    Args:
        subset: subset name forwarded to the video dataset builder.
        args: options object; ``args.dataset_name`` selects the dataset.
        mode: mode string forwarded to the video dataset builder.

    Raises:
        ValueError: if ``args.dataset_name`` is not one of the known names.
    '''
    name = args.dataset_name
    if name not in ('activitynet', 'thumos14', 'hacs', 'muses'):
        raise ValueError(f'dataset {name} not supported')
    return build_video_dataset(name, subset, args, mode)

================================================
FILE: datasets/data_utils.py
================================================
'''Utilities for data loading'''

import json
import math
import logging
import os

import pandas as pd
import easydict
import yaml

import numpy as np
# import cv2
import torch
import torch.nn.functional as F
# import ipdb as pdb

def load_json(path):
    '''Read the JSON document stored at ``path`` and return the parsed object.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes, instead of being left for the garbage collector.
    '''
    with open(path) as f:
        return json.load(f)


def get_valid_anno(gt_instances, slice, thr=0.75,
        start_getter=lambda x: x['segment'][0],
        end_getter=lambda x: x['segment'][1]):
    '''Perform integrity based instance filtering.

    An instance is kept when the fraction of its duration that falls inside
    the time window (its "integrity") is at least ``thr``.

    Args:
        gt_instances: iterable of annotation dicts.
        slice: ``(start, end)`` of the time window, in the same unit as the
            instance boundaries.
        thr: minimum integrity required to keep an instance.
        start_getter: callable returning the start time of an instance.
        end_getter: callable returning the end time of an instance.

    Returns:
        A list of shallow copies of the kept instances. Their ``'segment'``
        entry is replaced by the clamped boundaries expressed relative to the
        window start. The input dicts are not modified.
    '''
    start, end = slice
    kept_instances = []
    for inst in gt_instances:
        inst_start = start_getter(inst)
        inst_end = end_getter(inst)
        duration = inst_end - inst_start

        # Degenerate instances (zero or negative length) have no meaningful
        # integrity; a zero length would also divide by zero below.
        if duration <= 0:
            continue

        # ignore insts outside the time window (slice)
        if inst_end <= start or inst_start >= end:
            continue

        # clamp the instance to the window
        new_start = max(inst_start, start)
        new_end = min(inst_end, end)
        integrity = (new_end - new_start) * 1.0 / duration

        if integrity >= thr:
            new_inst = dict(inst)
            new_inst['segment'] = [new_start - start, new_end - start]
            kept_instances.append(new_inst)
    return kept_instances


def get_dataset_dict(video_info_path, video_anno_path, subset, mode='test', exclude_videos=None, online_slice=False, slice_len=None, ignore_empty=True, slice_overlap=0, return_id_list=False):
    '''
    Prepare a dict that contains the information of each video, such as duration, annotations.
    Args:
        video_info_path: path to the video info file in json format. This file records the length and fps of each video.
        video_anno_path: path to the ActivityNet-style video annotation in json format.
        subset: e.g. train, val, test
        mode: train (for training) or test (for inference). Only affects how the trailing incomplete slice is placed.
        exclude_videos: optional list/tuple of video names to drop.
        online_slice: cut videos into slices for training and testing. It should be enabled if the videos are too long.
        slice_len: length of video slices (in feature snippets).
        ignore_empty: ignore video slices that does not contain any action instance. This should be enabled only in the training phase.
        slice_overlap: overlap ratio between adjacent slices (= overlap_length / slice_len)
        return_id_list: if True, also return the list of sample names in insertion order.

    Return:
        dict mapping a sample name (video name, or `<video>_window_<start>_<end>` when slicing)
        to a dict with keys 'annotations', 'src_vid_name', 'feature_fps', 'feature_length',
        'subset', 'feature_second' and 'time_offset'.
        When `return_id_list` is True, a tuple (dict, id_list) is returned instead.
    '''
    video_ft_info = load_json(video_info_path)
    anno_data = load_json(video_anno_path)['database']

    video_dict = {}
    id_list = []
    # number of source videos that contribute at least one sample
    cnt = 0

    # NOTE(review): `in subset` is a substring test if `subset` is a str and a
    # membership test if it is a collection -- confirm which one callers rely on.
    video_set = set([x for x in anno_data if anno_data[x]['subset'] in subset])
    # keep only videos that also have feature meta information
    video_set = video_set.intersection(video_ft_info.keys())

    if exclude_videos is not None:
        assert isinstance(exclude_videos, (list, tuple))
        video_set = video_set.difference(exclude_videos)

    # sort for a deterministic sample order
    video_list = list(sorted(video_set))

    for video_name in video_list:
        # remove ambiguous instances on THUMOS14
        annotations = [x for x in anno_data[video_name]['annotations'] if x['label'] != 'Ambiguous']
        # order instances by start + end (i.e. by their temporal center)
        annotations = list(sorted(annotations, key=lambda x: sum(x['segment'])))

        if video_name in video_ft_info:
            # video_info records the length in snippets, duration and fps (#frames per second) of the feature/image sequence
            video_info = video_ft_info[video_name]
            # number of frames or snippets
            feature_length = int(video_info['feature_length'])   
            feature_fps = video_info['feature_fps']
            feature_second = video_info['feature_second']
        else:
            continue

        video_subset = anno_data[video_name]['subset']
        # For THUMOS14, we crop video into slices of fixed length
        if online_slice:
            stride = slice_len * (1 - slice_overlap)

            if feature_length <= slice_len:
                # short video: a single slice covering the whole sequence
                slices = [[0, feature_length]]
            else:
                # The i-th complete slice (1-based) must satisfy
                #   stride * (i - 1) + slice_len <= feature_length
                # i.e. i <= (feature_length - slice_len) / stride + 1
                num_complete_slices = int(math.floor(
                    (feature_length / slice_len - 1) / (1 - slice_overlap) + 1))
                slices = [
                    [int(i * stride), int(i * stride) + slice_len] for i in range(num_complete_slices)]
                # some snippets remain after the last complete slice
                if (num_complete_slices - 1) * stride + slice_len < feature_length:
                    if mode != 'train':
                        # take the last incomplete slice
                        last_slice_start = int(stride * num_complete_slices)
                    else:
                        # move left to get a complete slice.
                        # This is a historical issue. The performance might be better
                        # if we keep the same rule for training and inference 
                        last_slice_start = max(0, feature_length - slice_len)
                    slices.append([last_slice_start, feature_length])
            num_kept_slice = 0
            for slice in slices:
                # slice boundaries converted from snippet indices to seconds
                time_slices = [slice[0] / video_info['feature_fps'], slice[1] / video_info['feature_fps']]
                feature_second = time_slices[1] - time_slices[0]
                # perform integrity-based instance filtering
                valid_annotations = get_valid_anno(annotations, time_slices)

                if not ignore_empty or len(valid_annotations) >= 1:
                    # rename the video slice
                    new_vid_name = video_name + '_window_{}_{}'.format(*slice)
                    # NOTE(review): 'feature_length' is always slice_len here, even for a
                    # shorter slice, and 'subset' stores the `subset` argument whereas the
                    # non-sliced branch stores the video's own subset field -- confirm intended.
                    new_vid_info = {
                        'annotations': valid_annotations, 'src_vid_name': video_name, 
                        'feature_fps': feature_fps, 'feature_length': slice_len, 
                        'subset': subset, 'feature_second': feature_second, 'time_offset': time_slices[0]}
                    video_dict[new_vid_name] = new_vid_info
                    id_list.append(new_vid_name)
                    num_kept_slice += 1
            if num_kept_slice > 0:
                cnt += 1
        # for ActivityNet and hacs, use the full-length videos as samples
        else:
            if not ignore_empty or len(annotations) >= 1:
                # Remove incorrect annotations on ActivityNet (instances not longer than 0.02)
                valid_annotations = [x for x in annotations if x['segment'][1] - x['segment'][0] > 0.02]

                if ignore_empty and len(valid_annotations) == 0:
                    continue

                video_dict[video_name] = {
                    'src_vid_name': video_name, 'annotations': valid_annotations, 
                    'feature_fps': feature_fps, 'feature_length': int(feature_length),
                    'subset': video_subset, 'feature_second': feature_second, 'time_offset': 0}
                id_list.append(video_name)
                cnt += 1
    logging.info('{} videos, {} slices'.format(cnt, len(video_dict)))
    if return_id_list:
        return video_dict, id_list
    else:
        return video_dict


def load_video_frames(frame_dir, start, seq_len, stride=1, fn_tmpl='img_%07d.jpg'):
    '''Placeholder for a frame loader; always raises NotImplementedError.'''
    raise NotImplementedError()


def load_feature(ft_path, ft_format, shape=None):
    '''Load a feature file and return it as a numpy array.

    Args:
        ft_path: path of the feature file.
        ft_format: 'npy' (loaded with ``np.load``) or 'torch' (loaded with
            ``torch.load`` and converted with ``.numpy()``).
        shape: layout hint. When it equals "CT" the loaded array is
            transposed before being returned; any other value leaves the
            array as stored.

    Returns:
        The loaded (and possibly transposed) array.

    Raises:
        ValueError: if ``ft_format`` is not supported.
    '''
    if ft_format == 'npy':
        video_df = np.load(ft_path)
    elif ft_format == 'torch':
        video_df = torch.load(ft_path).numpy()
    else:
        raise ValueError('unsupported feature format: {}'.format(ft_format))

    # Honour the layout hint for every format; it used to be applied to
    # 'npy' files only, so "CT" torch features came back untransposed.
    if shape == "CT":
        video_df = video_df.T
    return video_df


def get_dataset_info(dataset, feature):
    '''get basic information for each dataset

    Reads the path configuration from 'datasets/path.yml' (relative to the
    current working directory).

    Args:
        dataset: dataset name. Only 'thumos14' is implemented.
        feature: feature name. Only 'i3d2s' is supported for 'thumos14'.

    Returns:
        Tuple ``(subset_mapping, feature_info, ann_file, ft_info_file)``.

    Raises:
        NotImplementedError: for 'activitynet', 'hacs' and 'muses'.
        ValueError: for an unknown dataset or an unsupported feature.
    '''
    # Use a context manager so the config file handle is closed right away.
    with open('datasets/path.yml') as f:
        path_info = easydict.EasyDict(yaml.load(f, yaml.SafeLoader))

    if dataset == 'thumos14':
        # the 'val' split is used for training and the 'test' split for validation
        subset_mapping = {'train': 'val', 'val': 'test'}
        ann_file = path_info['thumos14']['ann_file']

        if feature == 'i3d2s':
            feature_info = {'local_path': path_info['thumos14'][feature]['local_path'], 'format': 'torch', 'fn_templ': '%s'}
            ft_info_file = path_info['thumos14'][feature]['ft_info_file']

        else:
            raise ValueError('unsupported feature, should be one of [i3d2s]')

    elif dataset in ('activitynet', 'hacs', 'muses'):
        raise NotImplementedError

    else:
        raise ValueError('unsupported dataset {}'.format(dataset))

    return subset_mapping, feature_info, ann_file, ft_info_file


def make_img_transform(*args, **kwargs):
    '''Placeholder for an image transform factory; always raises NotImplementedError.'''
    raise NotImplementedError()

================================================
FILE: datasets/path.yml
================================================
# set the path of features, anno file and feature info file

thumos14:
  ann_file: 'data/thumos14/th14_annotations_with_fps_duration.json'
  i3d2s:
    local_path: data/thumos14/I3D_2stream_Pth
    ft_info_file: 'data/thumos14/th14_i3d2s_ft_info.json'


================================================
FILE: datasets/tad_dataset.py
================================================
# ------------------------------------------------------------------------
# TadTR: End-to-end Temporal Action Detection with Transformer
# Copyright (c) 2021 - 2022. Xiaolong Liu.
# ------------------------------------------------------------------------

'''Universal TAD Dataset loader.'''

import json
import logging
import math
import os.path as osp

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch.utils.data
import tqdm
import h5py

from .data_utils import get_dataset_dict, load_feature, load_video_frames, get_dataset_info, make_img_transform
# from util.config import cfg
from util.segment_ops import segment_t1t2_to_cw


class TADDataset(torch.utils.data.Dataset):
    '''Dataset that yields ``(video_data, target)`` pairs for temporal action detection.'''

    def __init__(self, subset, mode, feature_info, ann_file, ft_info_file, transforms, mem_cache=False, online_slice=False, slice_len=None, slice_overlap=0, binary=False, padding=True, input_type='feature', img_stride=1):
        '''TADDataset
        Parameters:
            subset: train/val/test
            mode: train, or test
            feature_info: basic info of video features, e.g. path, file format, filename template
            ann_file: path to the ground truth file
            ft_info_file: path to the file that describe other information of each video
            transforms: which transform to use
            mem_cache: cache features of the whole dataset into memory.
            online_slice: cut videos into slices of `slice_len`
            slice_len: length of a slice
            slice_overlap: overlap ratio between adjacent slices
            binary: transform all gt to binary classes. This is required for training a class-agnostic detector
            padding: whether to pad the input feature to `slice_len`
            input_type: 'image' selects the (unimplemented) image pipeline, anything else loads features
            img_stride: stored on the instance; not used by the code in this class
        
        '''

        super().__init__()
        self.feature_info = feature_info
        self.ann_file = ann_file
        self.ft_info_file = ft_info_file
        self.subset = subset
        self.online_slice = online_slice
        self.slice_len = slice_len
        self.slice_overlap = slice_overlap
        self.padding = padding
        self.mode = mode
        self.transforms = transforms
        print('Use data transform {}'.format(self.transforms))
        self.binary = binary
        self.is_image_input = input_type == 'image'
        self.mem_cache = mem_cache
        self.img_stride = img_stride

        self._prepare()

    def _get_classes(self, anno_dict):
        '''get class list from the annotation dict

        Uses ``anno_dict['classes']`` when present; otherwise returns the
        sorted set of labels found in ``anno_dict['database']``.
        '''
        if 'classes' in anno_dict:
            classes = anno_dict['classes']
        else:
            database = anno_dict['database']
            all_gts = []
            for vid in database:
                all_gts += database[vid]['annotations']
            classes = list(sorted({x['label'] for x in all_gts}))
        return classes

    def _prepare(self):
        '''parse annotation file and build the sample index'''
        # close the annotation file as soon as it has been parsed
        with open(self.ann_file) as f:
            anno_dict = json.load(f)
        self.classes = self._get_classes(anno_dict)
      
        self.video_dict, self.video_list = get_dataset_dict(self.ft_info_file, self.ann_file, self.subset, mode=self.mode, online_slice=self.online_slice, slice_len=self.slice_len, slice_overlap=self.slice_overlap, ignore_empty=self.mode == 'train', return_id_list=True)

        logging.info("{} subset video numbers: {}".format(self.subset,len(self.video_list)))
        self.anno_dict = anno_dict

        # per-sample feature cache, keyed by sample name
        self.cached_data = {}

        # if the features of all videos is saved in one hdf5 file (all in one), e.g. TSP features
        self.all_video_data = {}
        feature_info = self.feature_info
        fn_templ = feature_info['fn_templ']
        src_video_list = {self.video_dict[k]['src_vid_name'] for k in self.video_list}

        if feature_info.get('all_in_one', False):
            # np.array() copies every dataset into memory, so the hdf5 file
            # can be closed once the loop is done.
            with h5py.File(feature_info['local_path'][self.subset]) as data:
                for k in src_video_list:
                    self.all_video_data[k] = np.array(data[fn_templ % k]).T
            if not self.online_slice:
                # without slicing, sample names equal source video names, so
                # the preloaded data can serve directly as the cache
                self.cached_data = self.all_video_data

    def __len__(self):
        return len(self.video_list)

    def _get_video_data(self, index):
        '''Dispatch to the image or the feature loader depending on the input type.'''
        if self.is_image_input:
            return self._get_img_data(index)
        else:
            return self._get_feature_data(index)

    def _get_feature_data(self,index):
        '''Return the feature sequence of sample ``index`` as a float tensor.

        With ``online_slice`` the slice boundaries are parsed from the last
        two '_'-separated fields of the sample name, and the slice is
        zero-padded on the right up to ``slice_len`` when ``padding`` is set.
        '''
        video_name = self.video_list[index]
        # directly fetch from memory
        if video_name in self.cached_data:
            video_data = self.cached_data[video_name]
            return torch.Tensor(video_data).float().contiguous()

        src_vid_name = self.video_dict[video_name]['src_vid_name']
        # retrieve feature info
        feature_info = self.feature_info
        # "ft" is short for "feature"
        local_ft_dir = feature_info['local_path']
        ft_format = feature_info['format']
        local_ft_path = osp.join(local_ft_dir, feature_info['fn_templ'] % src_vid_name) if local_ft_dir else None
        # the shape of feature sequence, can be TxC (in most cases) or CxT
        shape = feature_info.get('shape', 'TC')

        if src_vid_name in self.all_video_data:
            feature_data = self.all_video_data[src_vid_name].T
        
        else:
            feature_data = load_feature(local_ft_path, ft_format, shape)

        feature_data = feature_data.T   # T x C to C x T.

        if self.online_slice:
            slice_start, slice_end = [int(x) for x in video_name.split('_')[-2:]]
            assert slice_end  > slice_start
            assert slice_start < feature_data.shape[1]
            feature_data = feature_data[:, slice_start:slice_end]

            if self.padding and feature_data.shape[1] < self.slice_len:
                diff = self.slice_len - feature_data.shape[1]
                feature_data = np.pad(
                    feature_data, ((0, 0), (0, diff)), mode='constant')

                # IMPORTANT: if padding is done, the length info must be modified
                self.video_dict[video_name]['feature_length'] = self.slice_len
                self.video_dict[video_name]['feature_second'] = self.slice_len / self.video_dict[video_name]['feature_fps']

        if self.mem_cache and video_name not in self.cached_data:
            self.cached_data[video_name] = feature_data

        feature_data = torch.Tensor(feature_data).float().contiguous()
        return feature_data

    def _get_img_data(self, index):
        '''have not been tested'''
        raise NotImplementedError

    def _get_train_label(self, video_name):
        '''get normalized target

        Returns a dict with keys 'segments', 'labels', 'orig_labels',
        'video_id', 'video_duration' and 'feature_fps'. Segments are divided
        by the sample duration and passed through ``segment_t1t2_to_cw``.
        When the sample has at least one instance, 'segments' and 'labels'
        are torch tensors (float32 / int64); otherwise 'segments' stays an
        empty numpy array and 'labels' an empty list.
        '''
        video_info = self.video_dict[video_name]
        video_labels = video_info['annotations']
        feature_second = video_info['feature_second']
      
        target = {
            'segments': [], 'labels': [],
            'orig_labels': [], 'video_id': video_name,
            'video_duration': feature_second,   # only used in inference
            'feature_fps': video_info['feature_fps'],
            }
        for tmp_info in video_labels:
            segment = tmp_info['segment']
            # special rule for thumos14, treat ambiguous instances as negatives
            if tmp_info['label'] not in self.classes:
                continue
            # the label id of first foreground class is 0
            label_id = self.classes.index(tmp_info['label'])
            target['orig_labels'].append(label_id)

            if self.binary:
                label_id = 0
            target['segments'].append(segment)
            target['labels'].append(label_id)

        # normalize the coordinates by the sample duration
        target['segments'] = np.array(target['segments']) / feature_second
        
        if len(target['segments']) > 0:
            target['segments'] = segment_t1t2_to_cw(target['segments'])

            # convert to torch format
            for k, dtype in zip(['segments', 'labels'], ['float32', 'int64']):
                if not isinstance(target[k], torch.Tensor):
                    target[k] = torch.from_numpy(np.array(target[k], dtype=dtype))
       
        return target

    def __getitem__(self, index):
        '''Return ``(video_data, target)`` for sample ``index``.'''
        # Load the data first: padding may update the duration stored in
        # video_dict, which the target reads afterwards.
        video_data = self._get_video_data(index)
        video_name = self.video_list[index]

        target =  self._get_train_label(video_name)
        
        return video_data, target
       

def build(dataset, subset, args, mode):
    '''Create a TADDataset for `dataset`/`subset` from the options in `args`.

    The slice overlap comes from `args.slice_overlap` in 'train' mode and
    from `args.test_slice_overlap` otherwise. An image transform is only
    requested when `args.input_type` is 'image'.
    '''
    subset_mapping, feature_info, ann_file, ft_info_file = get_dataset_info(dataset, args.feature)

    transforms = make_img_transform(mode) if args.input_type == 'image' else None

    return TADDataset(
        subset_mapping[subset],
        mode,
        feature_info,
        ann_file,
        ft_info_file,
        transforms,
        online_slice=args.online_slice,
        slice_len=args.slice_len,
        slice_overlap=(args.slice_overlap if mode == 'train' else args.test_slice_overlap),
        binary=args.binary,
        input_type=args.input_type,
    )


================================================
FILE: datasets/tad_eval.py
================================================
# TadTR: End-to-end Temporal Action Detection with Transformer

import json
import os.path as osp
import os
import pandas as pd
import time
import numpy as np
import logging
import concurrent.futures
import sys
import logging
# import ipdb as pdb
import pickle

from opts import cfg

from Evaluation.eval_detection import compute_average_precision_detection
# from Evaluation.eval_proposal import average_recall_vs_avg_nr_proposals
import matplotlib.pyplot as plt
# from util.proposal_utils import soft_nms
from .data_utils import get_dataset_dict
from util.misc import all_gather
from util.segment_ops import soft_nms, temporal_nms


def eval_ap(iou, cls, gt, predition):
    '''Compute the detection AP of one class and return it with the class tag.

    Returns:
        Tuple ``(cls, ap)`` where ``ap`` is the value returned by
        ``compute_average_precision_detection(gt, predition, iou)``.
    '''
    average_precision = compute_average_precision_detection(gt, predition, iou)
    # flush so that output printed so far becomes visible promptly
    sys.stdout.flush()
    return (cls, average_precision)


def apply_nms(dets_arr, nms_thr=0.4, use_soft_nms=False):
    '''Apply (soft-)NMS to detections class by class and return them sorted by score.

    dets_arr: 2-D array whose first four columns are start, end, score, class id.
        Callers may append extra columns (e.g. a row index); with hard NMS the rows are
        handed to `temporal_nms` as they are (presumably it returns whole rows - verify
        against util.segment_ops).
    nms_thr: threshold forwarded to `temporal_nms`. It is not used by the soft-NMS branch.
    use_soft_nms: use `soft_nms` (with fixed parameters) instead of `temporal_nms`.

    Returns the kept detections ordered by descending score (column 2). An empty input
    is returned as an empty copy instead of raising.
    '''
    dets_arr = np.asarray(dets_arr)
    if dets_arr.size == 0:
        # np.concatenate() below cannot handle an empty list of per-class results
        return dets_arr.copy()

    # class ids live in column 3 (not necessarily the last column)
    unique_classes = np.unique(dets_arr[:, 3])
    output_dets = []
    for cls in unique_classes:
        this_cls_dets = dets_arr[dets_arr[:, 3] == cls]
        if not use_soft_nms:
            this_cls_dets_kept = temporal_nms(this_cls_dets, nms_thr)
        else:
            this_cls_dets_kept = soft_nms(this_cls_dets, 0.8, 0, 0, 100)
            # Every row in this group has the same class id, so re-attach exactly as many
            # class entries as rows were kept. soft_nms may return fewer rows than it was
            # given (TODO confirm), in which case the full column would not line up.
            classes = this_cls_dets[:len(this_cls_dets_kept), [3]]
            this_cls_dets_kept = np.concatenate((this_cls_dets_kept, classes), -1)
        output_dets.append(this_cls_dets_kept)
    output_dets = np.concatenate(output_dets, axis=0)
    # highest score first
    sort_idx = output_dets[:, 2].argsort()[::-1]
    output_dets = output_dets[sort_idx, :]
    return output_dets


class TADEvaluator(object):
    '''Collect temporal action detections and score them against the ground truth.

    Intended call order (as used by the evaluation loop): `update` for every batch,
    then `synchronize_between_processes`, `accumulate`, optionally `dump_detection`,
    and finally `summarize`.
    '''

    def __init__(self, dataset_name, subset, video_dict=None, nms_mode=['raw'], iou_range=[0.5], epoch=None, num_workers=None):
        '''Load the annotations and organise the ground truth per class.

        dataset_name: only 'thumos14' is implemented; other names raise NotImplementedError.
        subset: key of the subset mapping ('train' or 'val' for thumos14, which select the
            annotation subsets 'val' and 'test' respectively).
        video_dict: per-slice metadata of the dataset. `update` reads 'time_offset' and
            'src_vid_name' from it, `cross_window_fusion` reads 'feature_fps'.
        nms_mode: names of the post-processing modes to track ('raw' and/or 'nms').
        iou_range: [0.3:0.7:0.1] for thumos14; [0.5:0.95:0.05] for anet and hacs.
        epoch: only used to label the summary line.
        num_workers: worker processes for AP computation (capped at 8); defaults to the
            number of classes.
        '''

        self.epoch = epoch
        self.iou_range = iou_range
        self.nms_mode = nms_mode
        self.dataset_name = dataset_name
        self.ignored_videos = list()

        if dataset_name == 'thumos14':
            subset_mapping = {'train': 'val', 'val': 'test'}
            anno_file = 'data/thumos14/th14_annotations_with_fps_duration.json'
            # follow SSN/PGCN/AFSD/MUSES to remove three falsely annotated videos
            self.ignored_videos = ['video_test_0000270', 'video_test_0001292', 'video_test_0001496']
        else:
            raise NotImplementedError
        # use a context manager so the annotation file is closed right away
        with open(anno_file) as f:
            anno_dict = json.load(f)
        classes = self._get_classes(anno_dict)
        num_classes = len(classes)

        database = anno_dict['database']
        all_gt = []

        # exact comparison: the mapped subset is a plain string, so `in` would be a
        # substring test rather than a membership test
        target_subset = subset_mapping[subset]
        unique_video_list = [x for x in database if database[x]['subset'] == target_subset]

        for vid in unique_video_list:
            if vid in self.ignored_videos:
                continue
            this_gts = [x for x in database[vid]['annotations'] if x['label'] != 'Ambiguous']
            all_gt += [[vid, classes.index(x['label']), x['segment'][0], x['segment'][1]] for x in this_gts]

        all_gt = pd.DataFrame(all_gt, columns=["video-id", "cls", "t-start", "t-end"])
        self.video_ids = all_gt['video-id'].unique().tolist()
        logging.info('{} ground truth instances from {} videos'.format(len(all_gt), len(self.video_ids)))

        # per class ground truth
        # (`columns=` instead of a positional axis, which pandas 2.x no longer accepts)
        gt_by_cls = []
        for cls in range(num_classes):
            gt_by_cls.append(all_gt[all_gt.cls == cls].reset_index(drop=True).drop(columns='cls'))

        self.gt_by_cls = gt_by_cls
        self.all_pred = {k: [] for k in self.nms_mode}
        self.num_classes = num_classes
        self.classes = classes
        self.anno_dict = anno_dict
        self.all_gt = all_gt
        self.num_workers = num_classes if num_workers is None else num_workers
        self.video_dict = video_dict
        self.stats = {k: dict() for k in self.nms_mode}
        self.subset = subset

    def _get_classes(self, anno_dict):
        '''Return the class list: `anno_dict['classes']` if present, otherwise the sorted
        set of labels found in all annotations of `anno_dict['database']`.'''
        if 'classes' in anno_dict:
            classes = anno_dict['classes']
        else:
            database = anno_dict['database']
            all_gts = []
            for vid in database:
                all_gts += database[vid]['annotations']
            classes = list(sorted({x['label'] for x in all_gts}))
        return classes

    def update(self, pred, assign_cls_labels=False):
        '''Add the detections of one batch.

        pred: a dict of predictions for each video. For each video, the predictions are in a dict with these fields: scores, labels, segments
            (tensors; they are detached and converted to numpy here). Keys containing
            'window' are treated as slices: their segments are shifted by the slice's
            'time_offset' and assigned to its 'src_vid_name' from `self.video_dict`.
        assign_cls_labels: manually assign class labels to the detections. This is necessary when the predictions are class-agnostic.
            Not implemented: passing True raises NotImplementedError.
        '''
        pred_numpy = {k: {kk: vv.detach().cpu().numpy() for kk, vv in v.items()} for k, v in pred.items()}
        for k, v in pred_numpy.items():
            if 'window' not in k:
                this_dets = [
                    [v['segments'][i, 0],
                     v['segments'][i, 1],
                     v['scores'][i], v['labels'][i]]
                    for i in range(len(v['scores']))]
                video_id = k
            else:
                window_start = self.video_dict[k]['time_offset']
                video_id = self.video_dict[k]['src_vid_name']
                this_dets = [
                    [v['segments'][i, 0] + window_start,
                     v['segments'][i, 1] + window_start,
                     v['scores'][i],
                     v['labels'][i]]
                    for i in range(len(v['scores']))]

            # ignore videos that are not in ground truth set
            if video_id not in self.video_ids:
                continue
            this_dets = np.array(this_dets)   # start, end, score, label
            if this_dets.size == 0:
                # no detections for this entry; column indexing below would fail
                continue

            for nms_mode in self.nms_mode:
                # No NMS at this stage: detections are only ranked by score. NMS or
                # cross-window fusion, if any, happens in `accumulate`.
                sort_idx = this_dets[:, 2].argsort()[::-1]
                # only keep top 200 detections per entry
                dets = this_dets[sort_idx, :][:200, :]

                # On ActivityNet, follow the tradition to use external video label
                if assign_cls_labels:
                    raise NotImplementedError
                self.all_pred[nms_mode] += [[video_id, k] + det for det in dets.tolist()]

    def nms_whole_dataset(self):
        '''Run per-video NMS on the 'nms' predictions and keep the top 200 per video.

        Expects `self.all_pred['nms']` to be a DataFrame (see `accumulate`) and replaces
        it with a DataFrame that no longer has the 'slice-id' column.
        '''
        # sorted for a reproducible output order
        video_ids = sorted({v['src_vid_name'] for v in self.video_dict.values()})
        nms_pred = self.all_pred['nms']
        all_pred = []
        for vid in video_ids:
            this_dets = nms_pred[nms_pred['video-id'] == vid][['t-start', 't-end', 'score', 'cls']].values
            if len(this_dets) == 0:
                # a video without any prediction has nothing to suppress
                continue

            this_dets = apply_nms(this_dets)[:200, ...]
            all_pred += [[vid] + x.tolist() for x in this_dets]
        self.all_pred['nms'] = pd.DataFrame(all_pred, columns=["video-id", "t-start", "t-end", "score", "cls"])

    def cross_window_fusion(self):
        '''
        merge detections in the overlapped regions of adjacent windows. Only used for THUMOS14

        Slice names are parsed as '_'-separated strings whose fields 4 and 5 are the
        start and end of the slice (converted to seconds with 'feature_fps').
        Operates on `self.all_pred['raw']`, which must already be a DataFrame.
        '''
        raw_pred = self.all_pred['raw']
        all_pred = []

        video_ids = raw_pred['video-id'].unique()

        for vid in video_ids:
            this_dets = raw_pred[raw_pred['video-id'] == vid]
            slice_ids = this_dets['slice-id'].unique().tolist()
            if len(slice_ids) <= 1:
                # a single window has no overlapped region
                all_pred.append(this_dets)
                continue

            slice_sorted = sorted(slice_ids, key=lambda k: int(k.split('_')[4]))

            overlap_region_time_list = []
            for cur_name, next_name in zip(slice_sorted[:-1], slice_sorted[1:]):
                feature_fps = self.video_dict[cur_name]['feature_fps']
                # parse the temporal coordinate from name
                cur_slice = [int(x) for x in cur_name.split('_')[4:6]]
                next_slice = [int(x) for x in next_name.split('_')[4:6]]
                # the overlap spans from the start of the next window to the end of this one
                overlap_region_time_list.append(
                    [next_slice[0] / feature_fps, cur_slice[1] / feature_fps])

            mask_union = None
            processed_dets = []
            for overlap_region_time in overlap_region_time_list:
                inters = np.minimum(this_dets['t-end'], overlap_region_time[1]) - np.maximum(this_dets['t-start'], overlap_region_time[0])
                # we only perform NMS to the overlapped regions
                mask = inters > 0
                overlap_dets = this_dets[mask]
                overlap_dets_arr = overlap_dets[['t-start', 't-end', 'score', 'cls']].values
                if len(overlap_dets) > 0:
                    # append the row position as a 5th column so the surviving rows can be
                    # mapped back to the DataFrame after NMS
                    kept_dets_arr = apply_nms(np.concatenate((overlap_dets_arr, np.arange(len(overlap_dets_arr))[:, None]), axis=1))
                    processed_dets.append(overlap_dets.iloc[kept_dets_arr[:, -1].astype('int64')])

                if mask_union is not None:
                    mask_union = mask_union | mask
                else:
                    mask_union = mask
            # instances not in overlapped region
            processed_dets.append(this_dets[~mask_union])
            all_pred += processed_dets

        if not all_pred:
            # no predictions at all: keep the (empty) frame, pd.concat rejects an empty list
            return
        self.all_pred['raw'] = pd.concat(all_pred)

    def accumulate(self, test_slice_overlap=0):
        '''accumulate detections in all videos

        Converts the collected rows into DataFrames, applies cross-window fusion
        ('raw') or whole-dataset NMS ('nms') on thumos14 when test windows overlap,
        and splits the predictions per class into `self.pred_by_cls`.
        '''
        for nms_mode in self.nms_mode:
            self.all_pred[nms_mode] = pd.DataFrame(self.all_pred[nms_mode], columns=["video-id", "slice-id", "t-start", "t-end", "score", "cls"])

        self.pred_by_cls = {}
        for nms_mode in self.nms_mode:
            if self.dataset_name == 'thumos14' and nms_mode == 'raw' and test_slice_overlap > 0:
                self.cross_window_fusion()
            # if you really want to use NMS
            if self.dataset_name == 'thumos14' and nms_mode == 'nms' and test_slice_overlap > 0:
                self.nms_whole_dataset()

            mode_pred = self.all_pred[nms_mode]
            # `columns=` instead of a positional axis, which pandas 2.x no longer accepts
            self.pred_by_cls[nms_mode] = [
                mode_pred[mode_pred.cls == cls].reset_index(drop=True).drop(columns='cls')
                for cls in range(self.num_classes)]

    def import_prediction(self):
        '''Placeholder; does nothing.'''
        pass

    def format_arr(self, arr, format='{:.2f}'):
        '''Format every element of `arr` with `format` and join them with spaces.'''
        line = ' '.join([format.format(x) for x in arr])
        return line

    def synchronize_between_processes(self):
        '''Print local prediction/video counts, then merge the predictions of all processes.

        Must be called while `self.all_pred` still holds lists of rows, i.e. before
        `accumulate`.
        '''
        mode = self.nms_mode[0]
        print(
            len(self.all_pred[mode]),
            len({x[0] for x in self.all_pred[mode]})
        )
        self.all_pred = merge_distributed(self.all_pred)

    def summarize(self):
        '''Compute mAP and collect stats

        Logs a table of AP at the displayed IoU thresholds plus their average, stores
        'AP50' in `self.stats` for every mode and keeps the table in `self.stats_summary`.
        '''
        if self.dataset_name in ['thumos14', 'muses']:
            # 0.3~0.7 avg
            display_iou_thr_inds = [0, 1, 2, 3, 4]
        else:
            # 0.5 0.75 0.95 avg
            display_iou_thr_inds = [0, 5, 9]
        # only display thresholds that exist (e.g. with the default iou_range=[0.5])
        display_iou_thr_inds = [i for i in display_iou_thr_inds if i < len(self.iou_range)]

        for nms_mode in self.nms_mode:
            logging.info(
                'mode={} {} predictions from {} videos'.format(
                    nms_mode,
                    len(self.all_pred[nms_mode]),
                    len(self.all_pred[nms_mode]['video-id'].unique()))
            )

        header = ' '.join('%.2f' % self.iou_range[i] for i in display_iou_thr_inds) + ' avg'  # 0 5 9
        lines = []
        for nms_mode in self.nms_mode:
            per_iou_ap = self.compute_map(nms_mode)
            line = ' '.join(['%.2f' % (100*per_iou_ap[i]) for i in display_iou_thr_inds]) + ' %.2f' % (100*per_iou_ap.mean()) + ' {} epoch{}'.format(nms_mode, self.epoch)
            lines.append(line)
        msg = header
        for l in lines:
            msg += '\n' + l
        logging.info('\n' + msg)

        # Locate IoU=0.5 in the configured range; fall back to the historical positions
        # (index 2 for thumos14, 0 otherwise) when 0.5 is not part of the range.
        legacy_idx = 2 if self.dataset_name == 'thumos14' else 0
        ap50_idx = next(
            (i for i, thr in enumerate(self.iou_range) if abs(thr - 0.5) < 1e-6),
            legacy_idx)
        for nms_mode in self.nms_mode:
            self.stats[nms_mode]['AP50'] = self.stats[nms_mode]['per_iou_ap'][ap50_idx]
        self.stats_summary = msg

    def compute_map(self, nms_mode):
        '''Compute mean average precision

        Evaluates every class in a process pool and stores 'mAP', 'ap_values'
        (classes x IoU thresholds), 'per_iou_ap' and 'per_cls_ap' in
        `self.stats[nms_mode]`. Returns the per-IoU AP averaged over classes.
        Requires `accumulate` to have been called.
        '''
        gt_by_cls, pred_by_cls = self.gt_by_cls, self.pred_by_cls[nms_mode]

        iou_range = self.iou_range
        num_classes = self.num_classes
        ap_values = np.zeros((num_classes, len(iou_range)))

        # at most 8 workers, but never fewer than 1 (the executor rejects 0)
        max_workers = max(1, min(self.num_workers, 8))
        with concurrent.futures.ProcessPoolExecutor(max_workers) as p:
            futures = []
            for cls in range(len(pred_by_cls)):
                if len(gt_by_cls[cls]) == 0:
                    logging.info('no gt for class {}'.format(self.classes[cls]))
                if len(pred_by_cls[cls]) == 0:
                    logging.info('no prediction for class {}'.format(self.classes[cls]))
                futures.append(p.submit(eval_ap, iou_range, cls, gt_by_cls[cls], pred_by_cls[cls]))
            for f in concurrent.futures.as_completed(futures):
                # results arrive out of order; eval_ap returns (class index, AP values)
                x = f.result()
                ap_values[x[0], :] = x[1]

        per_iou_ap = ap_values.mean(axis=0)
        per_cls_ap = ap_values.mean(axis=1)
        mAP = per_cls_ap.mean()

        self.stats[nms_mode]['mAP'] = mAP
        self.stats[nms_mode]['ap_values'] = ap_values
        self.stats[nms_mode]['per_iou_ap'] = per_iou_ap
        self.stats[nms_mode]['per_cls_ap'] = per_cls_ap
        return per_iou_ap

    def dump_to_json(self, dets, save_path):
        '''Convert detections to the ActivityNet result format and optionally save them.

        dets: DataFrame with the columns 'video-id', 't-start', 't-end', 'score', 'cls'.
        save_path: output JSON file; nothing is written when it is empty or None.

        Returns the result dictionary. A leading 'v_' is stripped from video ids.
        '''
        result_dict = {}
        videos = dets['video-id'].unique()
        for video in videos:
            this_detections = dets[dets['video-id'] == video]
            det_list = []
            for idx, row in this_detections.iterrows():
                det_list.append(
                    {'segment': [float(row['t-start']), float(row['t-end'])], 'label': self.classes[int(row['cls'])], 'score': float(row['score'])}
                )

            video_id = video[2:] if video.startswith('v_') else video
            result_dict[video_id] = det_list

        # the standard detection format for ActivityNet
        output_dict = {
            "version": "VERSION 1.3",
            "results": result_dict,
            "external_data": {}}
        if save_path:
            dirname = osp.dirname(save_path)
            # a bare file name has no directory part; os.makedirs('') would fail
            if dirname:
                os.makedirs(dirname, exist_ok=True)
            with open(save_path, 'w') as f:
                json.dump(output_dict, f)
        return output_dict

    def dump_detection(self, save_path=None):
        '''Write the detections of every mode to `save_path.format(mode)` as JSON.

        save_path: format string with one placeholder for the mode name, e.g.
            'outputs/detection_{}.json'. Raises ValueError when it is missing.
        '''
        if save_path is None:
            raise ValueError('save_path must be a format string such as "detection_{}.json"')
        for nms_mode in self.nms_mode:
            target_path = save_path.format(nms_mode)
            logging.info(
                'dump detection result in JSON format to {}'.format(target_path))
            self.dump_to_json(self.all_pred[nms_mode], target_path)


def merge_distributed(all_pred):
    '''gather outputs from different nodes at distributed mode

    all_pred: dict mapping a mode name to this process's list of prediction rows.
    Returns a new dict with the same keys whose lists concatenate the rows from every
    gathered copy, in the order `all_gather` returns them.
    '''
    merged = {mode: [] for mode in all_pred}
    for part in all_gather(all_pred):
        for mode, rows in part.items():
            merged[mode] += rows
    return merged

    
if __name__ == '__main__':
    # Nothing to run: this module has no standalone behaviour.
    pass


================================================
FILE: demo.py
================================================
# ------------------------------------------------------------------------
# TadTR: End-to-end Temporal Action Detection with Transformer
# Copyright (c) 2021. Xiaolong Liu.
# ------------------------------------------------------------------------


from models import build_model
from opts import update_cfg_from_file
from util.misc import NestedTensor
import torch
import time
import pdb


# @torch.no_grad()
def demo(args, cfg):
    '''Smoke test: build the model and run one forward pass on random input.

    args: parsed command line arguments; only `args.device` is read.
    cfg: configuration used to build the model; `cfg.feature_dim` sets the channel
        dimension of the random input.
    Prints 'Passed' when the forward pass completes.
    '''
    device = torch.device(args.device)
    model, _, _ = build_model(cfg)

    batch_size, num_steps = 1, 100
    features = torch.rand([batch_size, cfg.feature_dim, num_steps]).to(device)
    mask = torch.ones([batch_size, num_steps], dtype=torch.bool).to(device)
    samples = NestedTensor(features, mask)

    # dummy annotations; they are built but not consumed by the forward pass below
    targets = []
    for _ in range(batch_size):
        targets.append({
            'labels': torch.LongTensor([0, 0]).to(device),
            'segments': torch.FloatTensor([[0.5, 0.2], [0.7, 0.3]]).to(device),
            'orig_size': 100.0
        })

    model.to(device)

    outputs = model(samples)

    print('Passed')


if __name__ == '__main__':
    # Script entry: parse the command line, assemble the config and run the demo.
    from opts import get_args_parser, cfg, update_cfg_with_args
    args = get_args_parser().parse_args()

    # values from the config file are applied first, then the command-line overrides
    if args.cfg:
        update_cfg_from_file(cfg, args.cfg)
    update_cfg_with_args(cfg, args.opt)

    # actionness regression is switched off when CUDA is disabled
    if cfg.disable_cuda:
        cfg.act_reg = False
    demo(args, cfg)


================================================
FILE: docs/1_train_on_your_dataset.md
================================================
# Train and Evaluate TadTR on Your Dataset

TODO

================================================
FILE: engine.py
================================================
# ------------------------------------------------------------------------
# TadTR: End-to-end Temporal Action Detection with Transformer
# ------------------------------------------------------------------------
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------

"""
Train and eval functions used in main.py
"""
import math
import os.path as osp
import sys
from typing import Iterable
import tqdm
import logging

import torch

import util.misc as utils
from datasets.tad_eval import TADEvaluator
import pickle

def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module,
                    data_loader: Iterable, optimizer: torch.optim.Optimizer,
                    device: torch.device, epoch: int, cfg, max_norm: float = 0):
    """Train the model for one epoch.

    Gradients are accumulated over ``cfg.iter_size`` iterations before each optimizer
    step. Gradients left over at the end of the epoch (when the number of iterations
    is not a multiple of ``cfg.iter_size``) are discarded.

    Args:
        model: network to train; called with ``(samples.tensors, samples.mask)``.
        criterion: loss module returning a dict of losses; its ``weight_dict`` selects
            and weights the terms that are optimized.
        data_loader: yields ``(samples, targets)`` batches.
        optimizer: optimizer holding the parameters to update.
        device: device the samples and the 'segments'/'labels' targets are moved to.
        epoch: epoch index, used only in the log header.
        cfg: configuration; ``cfg.iter_size`` is read here.
        max_norm: gradient clipping threshold; clipping is skipped when it is not positive.

    Returns:
        dict mapping each logged meter name to its global average.

    The process exits with status 1 if the loss becomes non-finite.
    """
    model.train()
    criterion.train()

    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('lr', utils.SmoothedValue(
        window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)
    print_freq = 20
    cnt = 0

    for samples, targets in metric_logger.log_every(data_loader, print_freq, header):
        samples = samples.to(device)
        targets = [{k: v.to(device) if k in ['segments', 'labels']
                    else v for k, v in t.items()} for t in targets]

        outputs = model((samples.tensors, samples.mask))
        loss_dict = criterion(outputs, targets)
        weight_dict = criterion.weight_dict
        losses = sum(loss_dict[k] * weight_dict[k]
                     for k in loss_dict.keys() if k in weight_dict)
        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        # loss of each type
        loss_dict_reduced_unscaled = {f'{k}_unscaled': v
                                      for k, v in loss_dict_reduced.items()}
        # weighted_loss of each type
        loss_dict_reduced_scaled = {k: v * weight_dict[k]
                                    for k, v in loss_dict_reduced.items() if k in weight_dict}
        losses_reduced_scaled = sum(loss_dict_reduced_scaled.values())

        loss_value = losses_reduced_scaled.item()

        if not math.isfinite(loss_value):
            logging.info("Loss is {}, stopping training".format(loss_value))
            logging.info(str(loss_dict_reduced))
            sys.exit(1)

        losses.backward()
        if (cnt + 1) % cfg.iter_size == 0:
            # scale gradients when iter size is functioning
            if cfg.iter_size != 1:
                for g in optimizer.param_groups:
                    for p in g['params']:
                        # parameters that did not take part in the forward pass
                        # have no gradient to scale
                        if p.grad is not None:
                            p.grad /= cfg.iter_size

            if max_norm > 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
            optimizer.step()
            optimizer.zero_grad()

        metric_logger.update(
            loss=loss_value, **loss_dict_reduced_scaled, **loss_dict_reduced_unscaled)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

        cnt += 1

    # drop gradients of an incomplete accumulation cycle
    optimizer.zero_grad()
    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    logging.info(f"Averaged stats:{metric_logger}")
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}


def to_device(t, device):
    '''Move `t` to `device` via its `.to()` method; lists and tuples are returned untouched.'''
    is_sequence = isinstance(t, (list, tuple))
    return t if is_sequence else t.to(device)


@torch.no_grad()
def test(model, criterion, postprocessor, data_loader, base_ds, device, output_dir, cfg, subset='val', epoch=None, test_mode=False):
    '''
    Run inference and evaluation. Do not compute loss

    model, criterion: both are switched to eval mode; the criterion is not called.
    postprocessor: converts raw model outputs into per-video detections; called with
        the outputs, the video durations and `fuse_score=cfg.act_reg`.
    data_loader: yields (samples, targets); each target provides 'video_duration'
        and 'video_id'.
    base_ds: video dict of the dataset, handed to the evaluator.
    device: device the samples and durations are moved to.
    output_dir: currently unused; detections are dumped under 'outputs'.
    cfg: configuration; dataset_name, act_reg, binary and test_slice_overlap are read.
    subset: subset name handed to the evaluator.
    epoch: epoch index used to label the summary.
    test_mode: indicates that we are evaluating specific epoch during testing;
        when set, the detections are also dumped to 'outputs/detection_{mode}.json'.

    Returns a dict with the evaluator statistics keyed '<stat>_<mode>' plus
    'stats_summary'.
    '''
    model.eval()
    criterion.eval()

    # kept from the original implementation; not referenced again in this function
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter('class_error', utils.SmoothedValue(
        window_size=1, fmt='{value:.2f}'))

    iou_range = [0.3, 0.4, 0.5, 0.6, 0.7] if cfg.dataset_name == 'thumos14' else [
        num/100 for num in range(50, 100, 5)]

    action_evaluator = TADEvaluator(cfg.dataset_name, subset, base_ds, nms_mode=[
                                          'raw'], iou_range=iou_range, epoch=epoch)

    for (samples, targets) in tqdm.tqdm(data_loader, total=len(data_loader)):
        samples = samples.to(device)
        outputs = model((samples.tensors, samples.mask))

        video_duration = torch.FloatTensor(
            [t["video_duration"] for t in targets]).to(device)
        results = postprocessor(outputs, video_duration, fuse_score=cfg.act_reg)

        res = {target['video_id']: output for target,
               output in zip(targets, results)}
        action_evaluator.update(res, assign_cls_labels=cfg.binary)

    # accumulate predictions from all videos
    action_evaluator.synchronize_between_processes()
    action_evaluator.accumulate(cfg.test_slice_overlap)
    # dump detections
    if test_mode:
        save_path = osp.join('outputs', 'detection_{}.json')
        action_evaluator.dump_detection(save_path)
    action_evaluator.summarize()

    # flatten {mode: {stat: value}} into {'<stat>_<mode>': value}
    stats = {}
    for k, v in action_evaluator.stats.items():
        for vk, vv in v.items():
            stats[vk + '_' + k] = vv

    # the f-string is already fully rendered; no further .format() is needed
    mAP_values = ' '.join([f'{k}: {100*v:.2f}'
                          for k, v in stats.items() if k.startswith('mAP')])
    logging.info(mAP_values)

    stats['stats_summary'] = action_evaluator.stats_summary

    return stats


================================================
FILE: main.py
================================================
# ------------------------------------------------------------------------
# TadTR: End-to-end Temporal Action Detection with Transformer
# Copyright (c) 2021 - 2012. Xiaolong Liu
# ------------------------------------------------------------------------
# Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0
# ------------------------------------------------------------------------
# and DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------
'''Entry for training and testing'''

import datetime
import json
import random
import time
from pathlib import Path
import re
import os
import logging
import sys
import os.path as osp

import numpy as np
import torch
from torch.utils.data import DataLoader, DistributedSampler

from opts import get_args_parser, cfg, update_cfg_with_args, update_cfg_from_file
import util.misc as utils
from datasets import build_dataset
from engine import train_one_epoch, test
from models import build_model
if cfg.tensorboard:
    from torch.utils.tensorboard import SummaryWriter

        
def main(args):
    '''Train or evaluate TadTR according to the command line arguments and the config.

    args: parsed command line arguments. Fields read here include cfg, opt, eval,
        resume, device, seed, distributed, gpu, multi_gpu and num_workers.

    In evaluation mode (`args.eval`) the checkpoint is loaded, evaluated once and the
    function returns. Otherwise the model is trained for `cfg.epochs` epochs with
    periodic checkpointing and evaluation.
    '''
    from util.logger import setup_logger

    if args.cfg is not None:
        update_cfg_from_file(cfg, args.cfg)

    update_cfg_with_args(cfg, args.opt)

    if cfg.output_dir:
        Path(cfg.output_dir).mkdir(parents=True, exist_ok=True)

    # The actionness regression module requires CUDA support
    # If your machine does not have CUDA enabled, this module will be disabled.
    if cfg.disable_cuda:
        cfg.act_reg = False

    utils.init_distributed_mode(args)

    if not args.eval:
        mode = 'train'
    else:
        mode = 'test'

    # Logs will be saved in log_path
    log_path = os.path.join(cfg.output_dir, mode + '.log')
    setup_logger(log_path)

    logging.info("git:\n  {}\n".format(utils.get_sha()))

    logging.info(' '.join(sys.argv))

    with open(osp.join(cfg.output_dir, mode + '_cmd.txt'), 'w') as f:
        f.write(' '.join(sys.argv) + '\n')
    logging.info(str(args))
    logging.info(str(cfg))

    device = torch.device(args.device)

    # fix the seed
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    if cfg.input_type == 'image':
        # We plan to support image input in the future
        raise NotImplementedError

    model, criterion, postprocessors = build_model(cfg)

    model.to(device)
    model_without_ddp = model

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module
    elif args.multi_gpu:
        model = torch.nn.DataParallel(model)
        model_without_ddp = model.module

    n_parameters = sum(p.numel() for p in model.parameters())
    logging.info('number of params: {}'.format(n_parameters))

    def match_name_keywords(n, name_keywords):
        '''True if any keyword is a substring of the parameter name `n`.'''
        return any(b in n for b in name_keywords)

    def load_checkpoint(path):
        '''Load a checkpoint onto the CPU from a local file or an https URL.'''
        if path.startswith('https'):
            return torch.hub.load_state_dict_from_url(
                path, map_location='cpu', check_hash=True)
        return torch.load(path, map_location='cpu')

    param_dicts = [
        # non-backbone, non-offset
        {
            "params":
                [p for n, p in model_without_ddp.named_parameters()
                 if not match_name_keywords(n, cfg.lr_backbone_names) and not match_name_keywords(n, cfg.lr_linear_proj_names) and p.requires_grad],
            "lr": cfg.lr,
            "initial_lr": cfg.lr
        },
        # backbone
        {
            "params": [p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, cfg.lr_backbone_names) and p.requires_grad],
            "lr": cfg.lr_backbone,
            "initial_lr": cfg.lr_backbone
        },
        # offset
        {
            "params": [p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, cfg.lr_linear_proj_names) and p.requires_grad],
            "lr": cfg.lr * cfg.lr_linear_proj_mult,
            "initial_lr": cfg.lr * cfg.lr_linear_proj_mult
        }
    ]

    optimizer = torch.optim.__dict__[cfg.optimizer](param_dicts, lr=cfg.lr,
                                                     weight_decay=cfg.weight_decay)

    output_dir = Path(cfg.output_dir)

    if args.resume == 'latest':
        args.resume = osp.join(cfg.output_dir, 'checkpoint.pth')
    elif args.resume == 'best':
        args.resume = osp.join(cfg.output_dir, 'model_best.pth')

    # osp.exists also works when output_dir is empty (current directory),
    # where os.listdir('') would raise
    if osp.exists(osp.join(cfg.output_dir, 'model_best.pth')) and not args.resume and not args.eval:
        # Trained models have been overwritten by accident before, so ask for
        # confirmation before reusing an output dir that already holds a best model.
        logging.error(
            'Danger! You are overwriting an existing output dir {}, probably because you forget to change the output_dir option'.format(cfg.output_dir))
        confirm = input('confirm: y/n')
        if confirm != 'y':
            return

    last_epoch = -1

    # The checkpoint is loaded once and reused below when the weights are restored.
    checkpoint = None
    if args.resume:
        checkpoint = load_checkpoint(args.resume)
        # checkpoints without an epoch entry start the schedule from scratch
        last_epoch = checkpoint.get('epoch', -1)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, cfg.lr_step, last_epoch=last_epoch)

    dataset_val = build_dataset(subset=cfg.test_set, args=cfg, mode='val')
    if not args.eval:
        dataset_train = build_dataset(subset='train', args=cfg, mode='train')

    if args.distributed:
        if not args.eval:
            sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)

    else:
        if not args.eval:
            sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    if not args.eval:
        batch_sampler_train = torch.utils.data.BatchSampler(
            sampler_train, cfg.batch_size, drop_last=True)

        data_loader_train = DataLoader(dataset_train,
                                       batch_sampler=batch_sampler_train,
                                       collate_fn=utils.collate_fn, num_workers=args.num_workers, pin_memory=True)

    data_loader_val = DataLoader(dataset_val, cfg.batch_size, sampler=sampler_val,
                                 drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers, pin_memory=True)

    base_ds = dataset_val.video_dict

    if not args.eval and cfg.tensorboard and utils.is_main_process():
        # Imported here because the module-level import is decided from the config
        # as it is at import time, i.e. before the config file and command-line
        # overrides above are applied.
        from torch.utils.tensorboard import SummaryWriter
        smry_writer = SummaryWriter(output_dir)
    else:
        smry_writer = None

    best_metric = -1
    best_metric_txt = ''

    if args.eval and not args.resume:
        args.resume = osp.join(output_dir, 'model_best.pth')

    # start training from this epoch. You do not need to set this option.
    start_epoch = 0
    if args.resume:
        print('loading checkpint {}'.format(args.resume))
        if checkpoint is None:
            checkpoint = load_checkpoint(args.resume)

        model_without_ddp.load_state_dict(checkpoint['model'], strict=False)

        if 'epoch' in checkpoint:
            start_epoch = checkpoint['epoch'] + 1

        if 'best_metric' in checkpoint:
            best_metric = checkpoint['best_metric']

    if args.eval:
        # the epoch is only used to label the summary; it may be missing from the checkpoint
        test_stats = test(model, criterion, postprocessors,
                          data_loader_val, base_ds, device, cfg.output_dir, cfg, subset=cfg.test_set, epoch=checkpoint.get('epoch'), test_mode=True)

        return

    logging.info("Start training")
    start_time = time.time()

    # stays empty if no epoch is run (e.g. resuming from a finished run)
    test_stats = {}

    for epoch in range(start_epoch, cfg.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)

        for group in optimizer.param_groups:
            logging.info('lr={}'.format(group['lr']))
        train_stats = train_one_epoch(
            model, criterion, data_loader_train, optimizer, device, epoch, cfg,
            cfg.clip_max_norm)

        lr_scheduler.step()

        if cfg.output_dir:
            # save checkpoint every `cfg.ckpt_interval` epochs, also when reducing the learning rate
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            if (epoch + 1) in cfg.lr_step or (epoch + 1) % cfg.ckpt_interval == 0:
                checkpoint_paths.append(
                    output_dir / f'checkpoint{epoch:04}.pth')
            ckpt = {
                'model': model_without_ddp.state_dict(),
                'epoch': epoch,
                'args': args,
                'cfg': cfg,
                'best_metric': best_metric,
            }
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(ckpt, checkpoint_path)

        if (epoch + 1) % cfg.test_interval == 0:
            test_stats = test(
                model, criterion, postprocessors, data_loader_val, base_ds, device, cfg.output_dir, cfg, epoch=epoch
            )
            prime_metric = 'mAP_raw'
            if test_stats[prime_metric] > best_metric:
                best_metric = test_stats[prime_metric]
                best_metric_txt = test_stats['stats_summary']
                logging.info(
                    'new best metric {:.4f}@epoch{}'.format(best_metric, epoch))
                if cfg.output_dir:
                    ckpt['best_metric'] = best_metric
                    best_ckpt_path = output_dir / 'model_best.pth'
                    utils.save_on_master(ckpt, best_ckpt_path)

        else:
            test_stats = {}

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     **{f'test_{k}': v for k, v in test_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}

        if cfg.output_dir and utils.is_main_process():
            # numpy arrays are not JSON serializable
            for k, v in log_stats.items():
                if isinstance(v, np.ndarray):
                    log_stats[k] = v.tolist()
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")
            if smry_writer:
                for k, v in log_stats.items():
                    if re.findall(r'loss_\S+unscaled', k) or k.endswith('loss') or 'lr' in k or 'AP50' in k or 'AP75' in k or 'AP95' in k or 'mAP' in k or 'AR' in k:
                        smry_writer.add_scalar(k, v, epoch)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    if utils.is_main_process():
        logging.info('Training time {}'.format(total_time_str))
        logging.info(str(
            ['{}:{}'.format(k, v) for k, v in test_stats.items() if 'AP' in k or 'AR' in k]))
        if smry_writer is not None:
            smry_writer.close()
    logging.info('best det result\n{}'.format(best_metric_txt))
    logging.info(log_path)


if __name__ == '__main__':
    # Script entry: wrap the shared argument parser, run main() and log the wall time.
    import argparse
    parser = argparse.ArgumentParser(
        'TadTR training and evaluation script', parents=[get_args_parser()])
    args = parser.parse_args()

    # s_ marks the start time used for the elapsed-time log below
    s_ = time.time()
    main(args)
    logging.info('main takes {:.3f} seconds'.format(time.time() - s_))


================================================
FILE: models/__init__.py
================================================
# ------------------------------------------------------------------------
# TadTR: End-to-end Temporal Action Detection with Transformer
# Copyright (c) 2021. Xiaolong Liu.
# ------------------------------------------------------------------------

'''build models'''

from .tadtr import build

def build_model(args):
    """Package-level entry point for model construction.

    Forwards ``args`` unchanged to ``build`` (imported from ``.tadtr``) and
    returns whatever that function returns.
    """
    built = build(args)
    return built


================================================
FILE: models/custom_loss.py
================================================
# Mostly copied from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

'''Focal loss implementation'''


import torch
import torch.nn.functional as F


def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
    """
    Sigmoid focal loss, as used in RetinaNet: https://arxiv.org/abs/1708.02002.
    Args:
        inputs: A float tensor of raw logits (sigmoid is applied here).
        targets: A float tensor with the same shape as inputs holding the
                 binary label of each element (0 negative, 1 positive).
        num_boxes: Normalizer; the summed loss is divided by this value.
        alpha: Weight of positive elements; negatives get (1 - alpha).
               Defaults to 0.25. Any negative value disables the weighting.
        gamma: Exponent of the modulating factor (1 - p_t) that
               down-weights easy elements.
    Returns:
        Scalar tensor: element losses averaged over dim 1, summed over the
        remaining dims and divided by num_boxes.
    """
    probabilities = inputs.sigmoid()
    bce = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")

    # Probability the model assigns to the labelled outcome of each element.
    p_true = probabilities * targets + (1 - probabilities) * (1 - targets)
    focal = bce * ((1 - p_true) ** gamma)

    if alpha >= 0:
        balance = alpha * targets + (1 - alpha) * (1 - targets)
        focal = balance * focal

    per_row = focal.mean(1)
    return per_row.sum() / num_boxes


if __name__ == "__main__":
    # Smoke test: random logits scored against random one-hot targets.
    # The targets must be floats with the same shape as the logits, and the
    # normalizer (num_boxes) is a required argument.
    num_samples, num_classes = 8, 2
    pred = torch.randn(num_samples, num_classes)
    labels = torch.randint(0, num_classes, (num_samples,))
    target = F.one_hot(labels, num_classes).to(pred.dtype)
    loss = sigmoid_focal_loss(pred, target, num_boxes=num_samples)
    print(loss)
    

================================================
FILE: models/matcher.py
================================================
# ------------------------------------------------------------------------
# TadTR: End-to-end Temporal Action Detection with Transformer
# ------------------------------------------------------------------------
# Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# ------------------------------------------------------------------------
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# ------------------------------------------------------------------------

# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.

import torch
from scipy.optimize import linear_sum_assignment
from torch import nn

from util.segment_ops import segment_cw_to_t1t2, segment_iou
import pdb


class HungarianMatcher(nn.Module):
    """One-to-one assignment between predicted and ground-truth segments.

    Targets carry no "no object" entry, so there are usually more predictions
    than targets. The best predictions are matched one-to-one and the rest stay
    unmatched (and are therefore treated as non-objects).
    """

    def __init__(self, cost_class: float = 1, cost_seg: float = 1, cost_iou: float = 1):
        """Store the relative weights of the three matching-cost terms.

        Params:
            cost_class: weight of the classification term
            cost_seg: weight of the L1 distance between segment coordinates
            cost_iou: weight of the segment IoU term
        """
        super().__init__()
        self.cost_class = cost_class
        self.cost_seg = cost_seg
        self.cost_iou = cost_iou
        every_weight_zero = cost_class == 0 and cost_seg == 0 and cost_iou == 0
        assert not every_weight_zero, "all costs cant be 0"

    @torch.no_grad()
    def forward(self, outputs, targets):
        """Match predictions to targets for every batch element.

        Params:
            outputs: dict with at least
                 "pred_logits": [batch_size, num_queries, num_classes] classification logits
                 "pred_segments": [batch_size, num_queries, 2] predicted segment coordinates

            targets: list of length batch_size; each entry is a dict with
                 "labels": [num_target_segments] class labels
                 "segments": [num_target_segments, 2] target segment coordinates

        Returns:
            A list of batch_size tuples (index_i, index_j) of int64 tensors:
                - index_i: indices of the selected predictions
                - index_j: indices of the matching targets
            with len(index_i) = len(index_j) = min(num_queries, num_target_segments).
        """
        batch_size, n_queries = outputs["pred_logits"].shape[:2]

        # Collapse batch and query dims so all costs are computed in one shot.
        probs = outputs["pred_logits"].flatten(0, 1).sigmoid()  # [batch_size * num_queries, num_classes]
        pred_segs = outputs["pred_segments"].flatten(0, 1)  # [batch_size * num_queries, 2]

        # Ground truth of the whole batch, concatenated along dim 0.
        gt_labels = torch.cat([tgt["labels"] for tgt in targets])
        gt_segs = torch.cat([tgt["segments"] for tgt in targets])

        # Focal-style classification cost, read out at each target's class.
        alpha, gamma = 0.25, 2.0
        negative_term = (1 - alpha) * (probs ** gamma) * (-(1 - probs + 1e-8).log())
        positive_term = alpha * ((1 - probs) ** gamma) * (-(probs + 1e-8).log())
        class_cost = (positive_term - negative_term)[:, gt_labels]

        # L1 distance between segment coordinates.
        seg_cost = torch.cdist(pred_segs, gt_segs, p=1)

        # Negated IoU, computed after converting both sides with segment_cw_to_t1t2.
        iou_cost = -segment_iou(segment_cw_to_t1t2(pred_segs), segment_cw_to_t1t2(gt_segs))

        # Weighted total, reshaped to [bs, num_queries, total number of targets].
        total = self.cost_seg * seg_cost + self.cost_class * class_cost + self.cost_iou * iou_cost
        total = total.view(batch_size, n_queries, -1).cpu()

        # Split the target axis per batch element; element b only uses its own chunk.
        gt_counts = [len(tgt["segments"]) for tgt in targets]
        matches = []
        for b, chunk in enumerate(total.split(gt_counts, -1)):
            pred_idx, gt_idx = linear_sum_assignment(chunk[b])
            matches.append((torch.as_tensor(pred_idx, dtype=torch.int64),
                            torch.as_tensor(gt_idx, dtype=torch.int64)))
        return matches


def build_matcher(args):
    """Create a HungarianMatcher from the ``set_cost_*`` options on ``args``."""
    weights = {
        "cost_class": args.set_cost_class,
        "cost_seg": args.set_cost_seg,
        "cost_iou": args.set_cost_iou,
    }
    return HungarianMatcher(**weights)

================================================
FILE: models/ops/roi_align/__init__.py
================================================
from .roi_align import ROIAlign

# __all__ = ['roi_pool', 'ROIAlign']

================================================
FILE: models/ops/roi_align/roi_align.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import torch
from torch import nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair

from . import Align1D as _align_1d

class _Align1D(Function):
    """Autograd function wrapping the compiled ``Align1D`` extension.

    ``forward`` and ``backward`` both delegate to the extension module
    (imported as ``_align_1d``); this class only records what the backward
    pass needs.
    """

    @staticmethod
    def forward(ctx, input, roi, feature_dim, ratio):
        """Run the extension's forward op and stash state for backward.

        Args:
            input: feature tensor; its size is unpacked as (bs, ch, t) in
                ``backward``.
            roi: ROI tensor, saved for the backward pass.
            feature_dim: third argument of the extension's forward/backward.
            ratio: sampling ratio forwarded to the extension.
        """
        ctx.save_for_backward(roi)
        ctx.feature_dim = feature_dim
        ctx.input_shape = input.size()
        ctx.sampling_ratio = ratio
        return _align_1d.forward(input, roi, feature_dim, ratio)

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        """Return the gradient w.r.t. ``input``; other arguments get None."""
        rois, = ctx.saved_tensors
        bs, ch, t = ctx.input_shape
        grad_input = _align_1d.backward(
            grad_output,
            rois,
            ctx.feature_dim,
            bs,
            ch,
            t,
            ctx.sampling_ratio,
        )
        # Exactly one entry per forward() argument:
        # (input, roi, feature_dim, ratio).
        return grad_input, None, None, None


align1d = _Align1D.apply


class ROIAlign(nn.Module):
    """Module wrapper around the 1D align op (``align1d``).

    Args:
        feature_dim: forwarded to the op as its ``feature_dim`` argument.
        ratio: sampling ratio forwarded to the op (default 0).
    """

    def __init__(self, feature_dim, ratio=0):
        super(ROIAlign, self).__init__()
        self.feature_dim = feature_dim
        self.ratio = ratio

    def forward(self, input, rois):
        """Apply the align op to ``input`` for the given ``rois``.

        Both tensors must live on the same device; an AssertionError is
        raised otherwise.
        """
        assert input.device == rois.device, (
            'Align operation requires '
            'both feature and roi are on the same device! '
            'Get feature on {} but roi on {}'.format(input.device, rois.device))

        return align1d(input, rois, self.feature_dim, self.ratio)

    def __repr__(self):
        # Fields are comma-separated so the representation stays readable.
        return "{}(feature_dim={}, sampling_ratio={})".format(
            self.__class__.__name__, self.feature_dim, self.ratio)

if __name__ == "__main__":
    # Smoke test for the align layer. The tensors are moved to CUDA, so a GPU
    # and the compiled extension are required.
    # NOTE(review): this module uses a relative import, so it presumably has
    # to be launched as a module (python -m ...) -- confirm.
    layer = ROIAlign(16)
    # layer = torch.nn.DataParallel(layer, device_ids=[0,1])
    features = torch.tensor([[[1.,2,3,4,5,6,7,8,9,10],[11,12,13,14,15,16,17,18,19,20]]]).cuda()
    proposal = torch.tensor([[0,-0.5,9.5],[0,0.1,0.9]]).cuda()
    print(features.shape, proposal.shape)
    output = layer(features, proposal)
    print("output has shape {}, with mean {}".format(output.shape, torch.mean(output)))
    print(output)

================================================
FILE: models/ops/roi_align/src/roi_align_cuda.cpp
================================================
#include <torch/extension.h>

#include <vector>

// CUDA forward declarations.
// Both functions take an explicit spatial_scale; the C++ wrappers below
// always pass 1.0 for it.

// Forward pass: returns the aligned output for `input` over `rois`.
at::Tensor Align_forward_cuda(const at::Tensor& input,
                                 const at::Tensor& rois,
                                 const float spatial_scale,
                                 const int pooled_height,
                                 const int sampling_ratio);

// Backward pass: returns the gradient w.r.t. the forward input, whose shape
// is described by (batch_size, channels, height).
at::Tensor Align_backward_cuda(const at::Tensor& grad,
                                  const at::Tensor& rois,
                                  const float spatial_scale,
                                  const int pooled_height,
                                  const int batch_size,
                                  const int channels,
                                  const int height,
                                  const int sampling_ratio);

// C++ interface
// Forward entry point exposed to Python; delegates to the CUDA
// implementation with the ROI scale fixed at 1.
at::Tensor Align_forward(const at::Tensor& input, // (bs,ch,t)
                         const at::Tensor& rois, // (bs, start, end)
                         const int pooled_height,
                         const int sampling_ratio) {
    const float spatial_scale = 1.0;
    return Align_forward_cuda(input, rois, spatial_scale, pooled_height, sampling_ratio);
}

// Backward entry point exposed to Python; delegates to the CUDA
// implementation with the ROI scale fixed at 1.
at::Tensor Align_backward(const at::Tensor& grad,
                          const at::Tensor& rois,
                          const int pooled_height,
                          const int batch_size,
                          const int channels,
                          const int height,
                          const int sampling_ratio) {
    const float spatial_scale = 1.0;
    return Align_backward_cuda(grad, rois, spatial_scale, pooled_height,
                               batch_size, channels, height, sampling_ratio);
}

// Python bindings: expose the two wrappers above as `forward` and `backward`
// on the extension module.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &Align_forward, "Align forward (CUDA)");
  m.def("backward", &Align_backward, "Align backward (CUDA)");
}

================================================
FILE: models/ops/roi_align/src/roi_align_kernel.cu
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// Modified by Frost for 1D usage
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>

#include <THC/THC.h>
#include <THC/THCAtomics.cuh>
#include <THC/THCDeviceUtils.cuh>

// TODO make it in a common file
// Grid-stride loop: each thread starts at its global thread index and
// advances by the total number of launched threads (blockDim.x * gridDim.x)
// until it reaches n, so a launch with fewer than n threads still covers
// every index.
#define CUDA_1D_KERNEL_LOOP(i, n)                            \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
       i += blockDim.x * gridDim.x)


// Linearly interpolates the 1D signal `bottom_data` (length `height`) at the
// fractional position `t`.
//   - Returns 0 when t lies outside [-1, height].
//   - Positions in [-1, 0] are clamped to 0; positions whose integer part is
//     >= height - 1 are clamped to the last element.
//   - `index` is not used by the computation (debugging aid only).
template <typename T>
__device__ T linear_interpolate(const T* bottom_data,
    const int height,
    T t,
    const int index /* index for debug only*/) {

  // Samples that fall outside the feature map contribute nothing.
  if (t < -1.0 || t > height) {
    //empty
    return 0;
  }

  // Clamp slightly negative positions onto the first element.
  if (t <= 0) t = 0;

  int t_low = (int) t;
  int t_high;

  // Pick the two neighbouring integer positions; at the right border both
  // neighbours collapse onto the last element and t is snapped to it.
  if (t_low >= height - 1) {
    t_high = t_low = height - 1;
    t = (T) t_low;
  } else {
    t_high = t_low + 1;
  }

  // Fractional distance from t_low (lt) and its complement (ht).
  T lt = t - t_low;
  T ht = 1. - lt;

  // Weight each neighbour by its closeness to t.
  T v1 = bottom_data[t_low];
  T v2 = bottom_data[t_high];
  T w1 = ht, w2 = lt;

  T val = (w1 * v1 + w2 * v2);
  // printf("Check Linear Interpolate: w1=%f, v1=%f, w2=%f, v2=%f \n", w1, v1, w2, v2);
  return val;
}

// Forward kernel of the 1D align op.
// Each loop iteration produces one output element. The flat index is decoded
// as (n, c, pt): ROI n, channel c, output bin pt, i.e. the output is laid out
// as [num_rois, channels, pooled_height].
// Every ROI occupies 3 consecutive values in `bottom_rois`:
// (batch index, start, end). Start and end are multiplied by `spatial_scale`.
// Each bin's value is the average of `roi_bin_grid` linearly interpolated
// samples placed at evenly spaced positions inside the bin.
template <typename T>
__global__ void Align1DForward(const int nthreads, const T* bottom_data,
    const T spatial_scale, const int channels,
    const int height,
    const int pooled_height, 
    const int sampling_ratio,
    const T* bottom_rois, T* top_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, pt) is an element in the pooled output
    int pt = index % pooled_height;
    int c = (index / pooled_height) % channels;
    int n = index / pooled_height / channels;

    // printf("Debug Main Loop: get pt, c, n are %d, %d, %d \n", pt, c, n);

    // Locate this ROI's (batch index, start, end) triple.
    const T* offset_bottom_rois = bottom_rois + n * 3;
    int roi_batch_ind = offset_bottom_rois[0];

    // Do not using rounding; this implementation detail is critical
    T roi_start = offset_bottom_rois[1] * spatial_scale;
    T roi_end = offset_bottom_rois[2] * spatial_scale;
    // printf("Debug roi boundary: w1,  w2,  is  %f, %f \n", roi_start,roi_end,);

    // Force malformed (too short or inverted) ROIs to have length at least 1
    T roi_height = max(roi_end- roi_start, (T)1.);
    T bin_size = static_cast<T>(roi_height) / static_cast<T>(pooled_height);

    // Start of the (batch, channel) row this ROI reads from.
    const T* offset_bottom_data = bottom_data + (roi_batch_ind * channels + c) * height;

    // Samples per bin: the fixed sampling_ratio when positive, otherwise
    // ceil(roi_height / pooled_height).
    int roi_bin_grid = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2

    // We do average (integral) pooling inside a bin
    const T count = roi_bin_grid; // number of samples averaged per bin

    T output_val = 0.;
    for (int it = 0; it < roi_bin_grid; it ++) // e.g., it = 0, 1
    {
      // Centre of the it-th of roi_bin_grid equal sub-intervals of bin pt.
      const T t = roi_start + pt * bin_size + static_cast<T>(it + .5f) * bin_size / static_cast<T>(roi_bin_grid); // e.g., 0.5, 1.5

      T val = linear_interpolate(offset_bottom_data, height, t, index);
      // printf("Debug linear_interpolate: input=height:%d, t:%f, ... ; output=val:%f \n", height, t, val);
      output_val += val;
    }
    output_val /= count;

    top_data[index] = output_val;
  }
}


// Computes the interpolation weights and neighbour indices that
// linear_interpolate would use at position `t`, for the backward pass.
// Outputs (by reference):
//   w1, w2        - weights of the lower / upper neighbour
//   t_low, t_high - neighbour indices, or -1 for both when t lies outside
//                   [-1, height] (the weights are then 0)
// `index` is not used by the computation (debugging aid only).
template <typename T>
__device__ void linear_interpolate_gradient(
    const int height, 
    T t,
    T & w1, T & w2,
    int & t_low, int & t_high, 
    const int index /* index for debug only*/) {

  // Samples outside the feature map receive no gradient; signal this with
  // negative indices.
  if (t < -1.0 || t > height) {
    //empty
    w1 = w2 = 0.;
    t_low = t_high = -1;
    return;
  }

  // Clamp slightly negative positions onto the first element.
  if (t <= 0) t = 0;

  t_low = (int) t;

  // At the right border both neighbours collapse onto the last element.
  if (t_low >= height - 1) {
    t_high = t_low = height - 1;
    t = (T) t_low;
  } else {
    t_high = t_low + 1;
  }

  // Fractional distance from t_low (lt) and its complement (ht).
  T lt = t - t_low;
  T ht = 1. - lt;

  // Same weighting as the forward pass: val = w1 * v1 + w2 * v2.
  // T val = (w1 * v1 + w2 * v2);
  // T w1 = ht, w2 = lt;
  w1 = ht , w2 = lt;

  return;
}

// Backward kernel of the 1D align op: scatters the gradient of every output
// element back onto the input positions that were sampled in the forward
// pass.
// The flat index is decoded exactly as in the forward kernel, (n, c, pt).
// Contributions are accumulated into `bottom_diff` with atomicAdd because
// several output elements can touch the same input location.
// `num_rois` is accepted but not read inside the kernel.
template <typename T>
__global__ void Align1DBackwardFeature(const int nthreads, const T* top_diff,
    const int num_rois, const T spatial_scale,
    const int channels, const int height,
    const int pooled_height,
    const int sampling_ratio,
    T* bottom_diff,
    const T* bottom_rois) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    // (n, c, pt) is an element in the pooled output
    int pt = (index ) % pooled_height;
    int c = (index / pooled_height) % channels;
    int n = index / pooled_height / channels;

    // Locate this ROI's (batch index, start, end) triple.
    const T* offset_bottom_rois = bottom_rois + n * 3;
    int roi_batch_ind = offset_bottom_rois[0];

    // Do not using rounding; this implementation detail is critical
    T roi_start= offset_bottom_rois[1] * spatial_scale;
    T roi_end= offset_bottom_rois[2] * spatial_scale;

    // Force malformed (too short or inverted) ROIs to have length at least 1
    T roi_height = max(roi_end- roi_start, (T)1.);
    T bin_size = static_cast<T>(roi_height) / static_cast<T>(pooled_height);

    // Start of the (batch, channel) row of the input gradient.
    T* offset_bottom_diff = bottom_diff + (roi_batch_ind * channels + c) * height;

    // Incoming gradient of this output element.
    int top_offset    = (n * channels + c) * pooled_height;
    const T* offset_top_diff = top_diff + top_offset;
    const T top_diff_this_bin = offset_top_diff[pt];

    // Samples per bin: the fixed sampling_ratio when positive, otherwise
    // ceil(roi_height / pooled_height).
    int roi_bin_grid= (sampling_ratio > 0) ? sampling_ratio : ceil(roi_height / pooled_height); // e.g., = 2

    // We do average (integral) pooling inside a bin
    const T count = roi_bin_grid; // number of samples averaged per bin

    for (int it = 0; it < roi_bin_grid; it ++) // e.g., it = 0, 1
    {
      // Same sample position as in the forward kernel.
      const T t = roi_start+ pt * bin_size+ static_cast<T>(it + .5f) * bin_size/ static_cast<T>(roi_bin_grid); // e.g., 0.5, 1.5

      T w1, w2;
      int t_low, t_high;

      linear_interpolate_gradient(height, t, w1, w2, t_low, t_high, index);

      // Each sample carries 1/count of the bin's gradient, split between the
      // two neighbours by their interpolation weights.
      T g1 = top_diff_this_bin * w1 / count;
      T g2 = top_diff_this_bin * w2 / count;

      // Negative indices mark samples that fell outside the feature map.
      if (t_low >= 0 && t_high >= 0)
      {
          atomicAdd(offset_bottom_diff + t_low, static_cast<T>(g1));
          atomicAdd(offset_bottom_diff + t_high, static_cast<T>(g2));
      } // if
    } // it
  } // CUDA_1D_KERNEL_LOOP
} // Align1DBackwardFeature


// Host launcher for the forward kernel.
// Allocates an output of shape [num_rois, channels, pooled_height] with the
// same options as `input` and launches Align1DForward on the current CUDA
// stream. Both tensors must be CUDA tensors. The ROI buffer is read with the
// scalar type dispatched from `input`.
// NOTE(review): no device guard is set here; presumably callers make sure the
// current CUDA device matches the tensors' device -- confirm for multi-GPU use.
at::Tensor Align_forward_cuda(const at::Tensor& input,
                                 const at::Tensor& rois,
                                 const float spatial_scale,
                                 const int pooled_height,
                                 const int sampling_ratio) {
  AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor");
  AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");
  auto num_rois = rois.size(0);
  auto channels = input.size(1);
  auto height = input.size(2);

  auto output = at::empty({num_rois, channels, pooled_height}, input.options());
  auto output_size = num_rois * pooled_height * channels;
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // 512 threads per block, at most 4096 blocks; the kernel's loop covers
  // any elements beyond grid * block.
  dim3 grid(std::min(THCCeilDiv((long)output_size, 512L), 4096L));
  dim3 block(512);

  // printf("Debug main function: height:%d\n", height);

  // Nothing to compute for an empty output; skip the launch.
  if (output.numel() == 0) {
    THCudaCheck(cudaGetLastError());
    return output;
  }

  AT_DISPATCH_FLOATING_TYPES(input.type(), "Align1D_forward", [&] {
    Align1DForward<scalar_t><<<grid, block, 0, stream>>>(
         output_size,
         input.contiguous().data<scalar_t>(),
         spatial_scale,
         channels,
         height,
         pooled_height,
         sampling_ratio,
         rois.contiguous().data<scalar_t>(),
         output.data<scalar_t>());
  });
  THCudaCheck(cudaGetLastError());
  return output;
}

// TODO remove the dependency on input and use instead its sizes -> save memory
// Host launcher for the backward kernel.
// Allocates a zero-initialised gradient of shape [batch_size, channels, height]
// with the same options as `grad` and launches Align1DBackwardFeature on the
// current CUDA stream, one loop iteration per element of `grad`. Both tensors
// must be CUDA tensors. The ROI buffer is read with the scalar type
// dispatched from `grad`.
// NOTE(review): no device guard is set here; presumably callers make sure the
// current CUDA device matches the tensors' device -- confirm for multi-GPU use.
at::Tensor Align_backward_cuda(const at::Tensor& grad,
                                  const at::Tensor& rois,
                                  const float spatial_scale,
                                  const int pooled_height,
                                  const int batch_size,
                                  const int channels,
                                  const int height,
                                  const int sampling_ratio) {
  AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor");
  AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor");

  auto num_rois = rois.size(0);
  // Zero-filled because the kernel accumulates into it with atomicAdd.
  auto grad_input = at::zeros({batch_size, channels, height}, grad.options());

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // 512 threads per block, at most 4096 blocks; the kernel's loop covers
  // any elements beyond grid * block.
  dim3 grid(std::min(THCCeilDiv((long)grad.numel(), 512L), 4096L));
  dim3 block(512);

  // handle possibly empty gradients
  if (grad.numel() == 0) {
    THCudaCheck(cudaGetLastError());
    return grad_input;
  }

  AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIAlign_backward", [&] {
    Align1DBackwardFeature<scalar_t><<<grid, block, 0, stream>>>(
         grad.numel(),
         grad.contiguous().data<scalar_t>(),
         num_rois,
         spatial_scale,
         channels,
         height,
         pooled_height,
         sampling_ratio,
         grad_input.data<scalar_t>(),
         rois.contiguous().data<scalar_t>());
  });
  THCudaCheck(cudaGetLastError());
  return grad_input;
}

================================================
FILE: models/ops/setup.py
================================================
# ------------------------------------------------------------------------
# TadTR: End-to-end Temporal Action Detection with Transformer
# Copyright (c) 2021. Xiaolong Liu.
# ------------------------------------------------------------------------
# Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------

import os
import glob
import pdb

import torch

from torch.utils.cpp_extension import CUDA_HOME
from torch.utils.cpp_extension import CppExtension
from torch.utils.cpp_extension import CUDAExtension

from setuptools import find_packages
from setuptools import setup

requirements = ["torch", "torchvision"]


def get_sources(extensions_dir):
    """Collect the source files of one extension directory.

    Returns the ``*.cpp`` files directly inside ``extensions_dir``, then the
    ``*.cpp`` files in its ``cpu`` subdirectory, then the ``*.cu`` files in
    its ``cuda`` subdirectory, as a single list.
    """
    patterns = (
        ("*.cpp",),
        ("cpu", "*.cpp"),
        ("cuda", "*.cu"),
    )
    sources = []
    for parts in patterns:
        sources.extend(glob.glob(os.path.join(extensions_dir, *parts)))
    return sources


def get_extensions():
    """Return the list of CUDA extensions to build.

    Only the ``roi_align.Align1D`` extension is built. Its source paths are
    relative, so the build is expected to run from this file's directory.

    ``define_macros`` and ``extra_compile_args`` are prepared here but are
    only consumed by the commented-out temporal deformable attention
    extension below.

    Raises:
        NotImplementedError: if CUDA is unavailable to torch or ``CUDA_HOME``
            is not set.
    """
    this_dir = os.path.dirname(os.path.abspath(__file__))

    extra_compile_args = {"cxx": []}
    define_macros = []

    if torch.cuda.is_available() and CUDA_HOME is not None:
        define_macros += [("WITH_CUDA", None)]
        extra_compile_args["nvcc"] = [
            "-DCUDA_HAS_FP16=1",
            "-D__CUDA_NO_HALF_OPERATORS__",
            "-D__CUDA_NO_HALF_CONVERSIONS__",
            "-D__CUDA_NO_HALF2_OPERATORS__",
        ]
    else:
        raise NotImplementedError('Cuda is not available')

    ext_modules = [
        # Temporal Deformable Attention, optional
        # CUDAExtension(
        #     "temporal_deform_attn.TemporalDeformableAttention",
        #     get_sources(os.path.join(this_dir, "temporal_deform_attn/src")),
        #     include_dirs=[os.path.join(this_dir, "temporal_deform_attn/src")],
        #     define_macros=define_macros,
        #     extra_compile_args=extra_compile_args
        # ),

        CUDAExtension('roi_align.Align1D', [
            'roi_align/src/roi_align_cuda.cpp',
            'roi_align/src/roi_align_kernel.cu'])
    ]
    return ext_modules

# Package definition for the compiled ops. The extension list comes from
# get_extensions() and is compiled through torch's BuildExtension command.
# Note that get_extensions() raises NotImplementedError when CUDA is not
# available, so this script cannot run on a CPU-only setup.
setup(
    name="TadTR_release",
    version="1.0",
    author="Xiaolong Liu",
    description="PyTorch Wrapper for CUDA Functions of TadTR",
    packages=find_packages(exclude=("configs", "tests",)),
    ext_modules=get_extensions(),
    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
)


================================================
FILE: models/ops/temporal_deform_attn/__init__.py
================================================
# ------------------------------------------------------------------------
# TadTR: End-to-end Temporal Action Detection with Transformer
# Copyright (c) 2021. Xiaolong Liu.
# ------------------------------------------------------------------------------------------------
# Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------

from .temporal_deform_attn import DeformAttn


================================================
FILE: models/ops/temporal_deform_attn/temporal_deform_attn.py
================================================
# ------------------------------------------------------------------------
# TadTR: End-to-end Temporal Action Detection with Transformer
# Copyright (c) 2021. Xiaolong Liu.
# ------------------------------------------------------------------------------------------------
# Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------

from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

from opts import cfg

# if not cfg.disable_cuda:
#     from .functions import TDAFunction

import warnings
import math
import pdb

import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.init import xavier_uniform_, constant_


def _is_power_of_2(n):
    """Return True if ``n`` is a positive power of two, False otherwise.

    Raises ValueError when ``n`` is not an int or is negative.
    """
    acceptable = isinstance(n, int) and n >= 0
    if not acceptable:
        raise ValueError(
            "invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
    if n == 0:
        return False
    # A power of two has a single set bit, so clearing the lowest set bit
    # leaves zero.
    return (n & (n - 1)) == 0


class DeformAttn(nn.Module):
    """Single-level deformable attention over a temporal (1D) sequence.

    Each query predicts ``n_points`` sampling offsets and attention weights
    per head; values are sampled at those locations and combined with the
    softmax-normalized weights.
    """

    def __init__(self, d_model=256, n_levels=1, n_heads=8, n_points=4):
        """
        Deformable Attention Module
        :param d_model      hidden dimension
        :param n_levels     number of feature levels (only 1 is supported)
        :param n_heads      number of attention heads
        :param n_points     number of sampling points per attention head
        """
        super().__init__()
        if d_model % n_heads != 0:
            raise ValueError(
                'd_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
        _d_per_head = d_model // n_heads
        # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
        if not _is_power_of_2(_d_per_head):
            warnings.warn("You'd better set d_model in DeformAttn to make the dimension of each attention head a power of 2 "
                          "which is more efficient in our CUDA implementation.")

        assert n_levels == 1, 'multi-level attention is not supported!'

        self.seq2col_step = 64

        self.d_model = d_model
        self.n_levels = n_levels
        self.n_heads = n_heads
        self.n_points = n_points

        self.sampling_offsets = nn.Linear(
            d_model, n_heads * n_levels * n_points)
        self.attention_weights = nn.Linear(
            d_model, n_heads * n_levels * n_points)
        self.value_proj = nn.Linear(d_model, d_model)
        self.output_proj = nn.Linear(d_model, d_model)

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialize the projections.

        Offset and attention-weight layers start with zero weights, so the
        initial sampling pattern is given purely by the offset bias. Value and
        output projections use Xavier-uniform weights and zero biases.
        """
        constant_(self.sampling_offsets.weight.data, 0.)
        # Initial per-head offsets are cos(k * 4*pi / n_heads); for the
        # default n_heads=8 this is (1, 0, -1, 0, 1, 0, -1, 0) up to rounding.
        thetas = torch.arange(
            self.n_heads, dtype=torch.float32) * (4.0 * math.pi / self.n_heads)
        grid_init = thetas.cos()[:, None]

        grid_init = grid_init.view(self.n_heads, 1, 1, 1).repeat(
            1, self.n_levels, self.n_points, 1)
        # The i-th sampling point of each head starts (i + 1) times further out.
        for i in range(self.n_points):
            grid_init[:, :, i, :] *= i + 1

        with torch.no_grad():
            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
        constant_(self.attention_weights.weight.data, 0.)
        constant_(self.attention_weights.bias.data, 0.)
        xavier_uniform_(self.value_proj.weight.data)
        constant_(self.value_proj.bias.data, 0.)
        xavier_uniform_(self.output_proj.weight.data)
        constant_(self.output_proj.bias.data, 0.)

    def forward(self, query, reference_points, input_flatten, input_temporal_lens, input_level_start_index, input_padding_mask=None):
        r"""
        :param query (= src + pos)         (N, Length_{query}, C)
        :param reference_points            (N, Length_{query}, n_levels, 1), range in [0, 1], left (0), right (1), including padding area
                                        or (N, Length_{query}, n_levels, 2), add additional (t) to form reference segments
        :param input_flatten (=src)        (N, \sum_{l=0}^{L-1} T_l, C)
        :param input_temporal_lens         (n_levels), [T_0, T_1, ..., T_(L-1)]
        :param input_level_start_index     (n_levels, ), [0, T_0, T_1, T_2, ..., T_{L-1}]
                                           (not read by the PyTorch backend)
        :param input_padding_mask          (N, \sum_{l=0}^{L-1} T_l), True for padding elements, False for non-padding elements

        :return (output, (sampling_locations, attention_weights))
                output                     (N, Length_{query}, C)
                sampling_locations         the locations passed to the sampling core
                attention_weights          (N, Length_{query}, n_heads, n_levels, n_points)
        :raises ValueError                 if the last dim of reference_points is not 1 or 2
        :raises NotImplementedError        if a non-PyTorch backend is selected
        """
        N, Len_q, _ = query.shape
        N, Len_in, _ = input_flatten.shape
        assert input_temporal_lens.sum() == Len_in

        value = self.value_proj(input_flatten)
        if input_padding_mask is not None:
            # Padded positions must not contribute to the sampled values.
            value = value.masked_fill(input_padding_mask[..., None], float(0))
        value = value.view(N, Len_in, self.n_heads,
                           self.d_model // self.n_heads)
        # the predicted offset in temporal axis. They are *absolute* values, not normalized
        sampling_offsets = self.sampling_offsets(query).view(
            N, Len_q, self.n_heads, self.n_levels, self.n_points, 1)
        attention_weights = self.attention_weights(query).view(
            N, Len_q, self.n_heads, self.n_levels * self.n_points)
        # Normalize jointly over all levels and points of a head.
        attention_weights = F.softmax(
            attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)

        if reference_points.shape[-1] == 1:
            # the reference points are normalized, but the offset are unnormalized
            # so we need to normalize the offsets
            offset_normalizer = input_temporal_lens[..., None]
            # (N, Length_{query}, n_heads, n_levels, n_points, 1)
            sampling_locations = reference_points[:, :, None, :, None, :] \
                + sampling_offsets / \
                offset_normalizer[None, None, None, :, None, :]
        # deform attention in the l-th (l >= 2) decoder layer when segment refinement is enabled
        elif reference_points.shape[-1] == 2:
            # offsets are related with the size of the reference segment
            sampling_locations = reference_points[:, :, None, :, None, :1] \
                + sampling_offsets / self.n_points * \
                reference_points[:, :, None, :, None, 1:] * 0.5

        else:
            raise ValueError(
                'Last dim of reference_points must be 1 or 2, but get {} instead.'.format(reference_points.shape[-1]))
        if cfg.dfm_att_backend == 'pytorch' or cfg.disable_cuda:
            # Implementation with PyTorch grid_sample operator.
            # Note that grid_sample only supports image inputs. We need to view the sequence as an image with height=1
            sampling_locations = torch.cat((sampling_locations, torch.ones_like(sampling_locations)*0.5), dim=-1)
            input_spatial_shapes = torch.stack((torch.ones_like(input_temporal_lens), input_temporal_lens), dim=-1)
            output = deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights)
        else:
            raise NotImplementedError(
                "The CUDA deformable attention backend is not available here; "
                "set cfg.dfm_att_backend to 'pytorch' or enable cfg.disable_cuda.")
            # # CUDA implementation. You will get identical results with the pytorch implementation
            # output = TDAFunction.apply(
            #     value, input_temporal_lens, input_level_start_index, sampling_locations, attention_weights, self.seq2col_step)
        output = self.output_proj(output)
        return output, (sampling_locations, attention_weights)


def deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
    '''Deformable attention implemented on top of ``F.grid_sample``.

    Params:
        value: tensor unpacked as (N, S, M, D); S is the total length of all
            levels concatenated along dim 1.
        value_spatial_shapes: iterable of (H, W) pairs, one per level; the
            product H*W is the number of positions that level occupies in `value`.
        sampling_locations: tensor unpacked as (N, Lq, M, L, P, 2), with
            coordinates in [0, 1] (they are mapped to [-1, 1] for grid_sample).
        attention_weights: tensor reshaped from (N, Lq, M, L, P).
    Returns:
        contiguous tensor of shape (N, Lq, M*D).
    '''
    batch, _, heads, head_dim = value.shape
    _, num_queries, heads, num_levels, num_points, _ = sampling_locations.shape

    # cut the flattened value sequence back into its per-level pieces
    level_sizes = [height * width for height, width in value_spatial_shapes]
    per_level_values = value.split(level_sizes, dim=1)

    # grid_sample expects normalized coordinates in [-1, 1]
    grids = 2 * sampling_locations - 1

    sampled_per_level = []
    for level, (height, width) in enumerate(value_spatial_shapes):
        # (N, H*W, M, D) -> (N, H*W, M*D) -> (N, M*D, H*W) -> (N*M, D, H, W)
        level_value = per_level_values[level].flatten(2).transpose(1, 2)
        level_value = level_value.reshape(batch * heads, head_dim, height, width)
        # (N, Lq, M, P, 2) -> (N, M, Lq, P, 2) -> (N*M, Lq, P, 2)
        level_grid = grids[:, :, :, level].transpose(1, 2).flatten(0, 1)
        # (N*M, D, Lq, P)
        sampled_per_level.append(
            F.grid_sample(level_value, level_grid,
                          mode='bilinear', padding_mode='zeros', align_corners=False))

    # (N, Lq, M, L, P) -> (N, M, Lq, L, P) -> (N*M, 1, Lq, L*P)
    weights = attention_weights.transpose(1, 2).reshape(
        batch * heads, 1, num_queries, num_levels * num_points)
    # (N*M, D, Lq, L, P) -> (N*M, D, Lq, L*P)
    sampled = torch.stack(sampled_per_level, dim=-2).flatten(-2)
    aggregated = (sampled * weights).sum(-1).view(batch, heads * head_dim, num_queries)
    return aggregated.transpose(1, 2).contiguous()

================================================
FILE: models/position_encoding.py
================================================
# ------------------------------------------------------------------------
# TadTR: End-to-end Temporal Action Detection with Transformer
# Copyright (c) 2021. Xiaolong Liu.
#  ------------------------------------------------------------------------
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------


"""
Positional encodings for the transformer.
"""
import math
import torch
from torch import nn

from util.misc import NestedTensor


class PositionEmbeddingSine(nn.Module):
    """
    Sinusoidal position embedding along the temporal axis, in the spirit of the
    one used in "Attention is all you need".

    Positions are the running count of non-padded steps (so the first valid step
    has position 1). With `normalize=True` they are rescaled so that the value at
    the last step is approximately `scale` (default 2*pi).
    """
    def __init__(self, num_pos_feats=256, temperature=10000, normalize=False, scale=None):
        super().__init__()
        # a custom scale only makes sense when positions are normalized
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        self.scale = 2 * math.pi if scale is None else scale

    def forward(self, tensor_list: NestedTensor):
        """Return the encoding, shape N x C x T with C = num_pos_feats, for
        `tensor_list`; only `tensor_list.mask` (N x T, True on padding) and the
        device of `tensor_list.tensors` are used."""
        features, padding = tensor_list.tensors, tensor_list.mask
        assert padding is not None

        # 1-based index of every valid step; padded steps repeat the last count
        positions = (~padding).cumsum(1, dtype=torch.float32)  # N x T
        if self.normalize:
            eps = 1e-6
            positions = positions / (positions[:, -1:] + eps) * self.scale

        # channels 2k and 2k+1 share the same frequency
        channel = torch.arange(self.num_pos_feats, dtype=torch.float32, device=features.device)
        frequency = self.temperature ** (2 * (channel // 2) / self.num_pos_feats)

        angles = positions[:, :, None] / frequency  # N x T x C
        sin_part = angles[:, :, 0::2].sin()
        cos_part = angles[:, :, 1::2].cos()
        # interleave sin/cos back into channel order: (sin, cos, sin, cos, ...)
        encoding = torch.stack((sin_part, cos_part), dim=3).flatten(2)
        return encoding.permute(0, 2, 1)    # N x C x T


def build_position_encoding(args):
    """Create the position-embedding module selected by `args.position_embedding`.

    Only the sinusoidal variant exists ('v2' and 'sine' are synonyms); it is built
    with `args.hidden_dim` channels and normalized positions. Any other value
    raises ValueError.
    """
    feat_dim = args.hidden_dim
    if args.position_embedding not in ('v2', 'sine'):
        raise ValueError(f"not supported {args.position_embedding}")
    return PositionEmbeddingSine(feat_dim, normalize=True)


================================================
FILE: models/tadtr.py
================================================
# ------------------------------------------------------------------------
# TadTR: End-to-end Temporal Action Detection with Transformer
# Copyright (c) 2021. Xiaolong Liu.
# ------------------------------------------------------------------------
# Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0
# ------------------------------------------------------------------------
# and DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------

"""
TadTR model and criterion classes.
"""
import math
import copy

import torch
import torch.nn.functional as F
from torch import nn

from util import segment_ops
from util.misc import (NestedTensor, nested_tensor_from_tensor_list,
                       accuracy, get_world_size,
                       is_dist_avail_and_initialized, inverse_sigmoid)
from models.matcher import build_matcher
from models.position_encoding import build_position_encoding
from .custom_loss import sigmoid_focal_loss
from .transformer import build_deformable_transformer
from opts import cfg

if not cfg.disable_cuda:
    from models.ops.roi_align import ROIAlign


def _get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])


def get_norm(norm_type, dim, num_groups=None):
    """Build a normalization layer over `dim` channels.

    'gn' gives nn.GroupNorm (and requires `num_groups`), 'bn' gives
    nn.BatchNorm1d; any other `norm_type` raises NotImplementedError.
    """
    if norm_type == 'bn':
        return nn.BatchNorm1d(dim)
    if norm_type != 'gn':
        raise NotImplementedError
    assert num_groups is not None, 'num_groups must be specified'
    return nn.GroupNorm(num_groups, dim)


class TadTR(nn.Module):
    """ This is the TadTR module that performs temporal action detection """

    def __init__(self, position_embedding, transformer, num_classes, num_queries, aux_loss=True,
                 with_segment_refine=True, with_act_reg=True, feature_dim=2048):
        """ Initializes the model.
        Parameters:
            position_embedding: module called on the input NestedTensor to produce the
                         positional encoding passed to the transformer.
            transformer: torch module of the transformer architecture. It must expose
                         `d_model` and `decoder.num_layers`. See transformer.py
            num_classes: number of action classes
            num_queries: number of action queries, ie detection slot. This is the maximal number of actions
                         TadTR can detect in a single video.
            aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.
            with_segment_refine: iterative segment refinement. When enabled every decoder layer
                         gets its own (cloned) classification / regression head; otherwise one
                         head is shared by all layers.
            with_act_reg: add the RoIAlign-based actionness regression head. This needs
                         `ROIAlign`, which this module only imports when `cfg.disable_cuda` is false.
            feature_dim: number of channels of the input video features (defaults to 2048,
                         the value that used to be hard-coded).
        """
        super().__init__()
        self.num_queries = num_queries
        self.transformer = transformer
        hidden_dim = transformer.d_model
        self.class_embed = nn.Linear(hidden_dim, num_classes)
        self.segment_embed = MLP(hidden_dim, hidden_dim, 2, 3)
        # each query embedding is split by the transformer into two halves of size hidden_dim
        self.query_embed = nn.Embedding(num_queries, hidden_dim*2)

        # single-level 1x1 projection of the input features to the transformer width
        self.input_proj = nn.ModuleList([
            nn.Sequential(
                nn.Conv1d(feature_dim, hidden_dim, kernel_size=1),
                nn.GroupNorm(32, hidden_dim),
            )])
        self.position_embedding = position_embedding
        self.aux_loss = aux_loss
        self.with_segment_refine = with_segment_refine
        self.with_act_reg = with_act_reg

        # focal-loss style bias init: every class starts with probability ~prior_prob
        prior_prob = 0.01
        bias_value = -math.log((1 - prior_prob) / prior_prob)
        self.class_embed.bias.data = torch.ones(num_classes) * bias_value
        # the last regression layer starts at zero so initial predictions equal the references
        nn.init.constant_(self.segment_embed.layers[-1].weight.data, 0)
        nn.init.constant_(self.segment_embed.layers[-1].bias.data, 0)
        for proj in self.input_proj:
            nn.init.xavier_uniform_(proj[0].weight, gain=1)
            nn.init.constant_(proj[0].bias, 0)

        num_pred = transformer.decoder.num_layers
        if with_segment_refine:
            # independent heads per decoder layer
            self.class_embed = _get_clones(self.class_embed, num_pred)
            self.segment_embed = _get_clones(self.segment_embed, num_pred)
            # bias of the width output (index 1) of the first layer's head
            nn.init.constant_(
                self.segment_embed[0].layers[-1].bias.data[1:], -2.0)
            # hack implementation for segment refinement
            self.transformer.decoder.segment_embed = self.segment_embed
        else:
            nn.init.constant_(
                self.segment_embed.layers[-1].bias.data[1:], -2.0)
            # the same head object is reused (shared weights) for every decoder layer
            self.class_embed = nn.ModuleList(
                [self.class_embed for _ in range(num_pred)])
            self.segment_embed = nn.ModuleList(
                [self.segment_embed for _ in range(num_pred)])
            self.transformer.decoder.segment_embed = None

        if with_act_reg:
            # RoIAlign params
            self.roi_size = 16
            self.roi_scale = 0
            self.roi_extractor = ROIAlign(self.roi_size, self.roi_scale)
            # maps the flattened RoI feature (roi_size * hidden_dim) to a score in (0, 1)
            self.actionness_pred = nn.Sequential(
                nn.Linear(self.roi_size * hidden_dim, hidden_dim),
                nn.ReLU(inplace=True),
                nn.Linear(hidden_dim, hidden_dim),
                nn.ReLU(inplace=True),
                nn.Linear(hidden_dim, 1),
                nn.Sigmoid()
            )

    def _to_roi_align_format(self, rois, T, scale_factor=1):
        '''Convert RoIs to RoIAlign format.
        Params:
            rois: normalized segments as (center, width), shape (batch_size, num_segments, 2)
            T: length of the video feature sequence
            scale_factor: the width of every segment is multiplied by this factor
                (values > 1 enlarge the RoIs around their centers)
        Returns:
            tensor of shape (batch_size * num_segments, 3) whose rows are
            (batch_index, start, end) in absolute feature coordinates, clamped to
            [0, T] and detached from the autograd graph.
        '''
        B, N = rois.shape[:2]
        rois_center = rois[:, :, 0:1]
        # expand (or shrink) the RoIs around their centers
        rois_size = rois[:, :, 1:2] * scale_factor
        # (center, width) -> (start, end) on the absolute axis
        rois_abs = torch.cat(
            (rois_center - rois_size/2, rois_center + rois_size/2), dim=2) * T
        rois_abs = torch.clamp(rois_abs, min=0, max=T)  # (B, N, 2)
        # add batch index; created directly with the RoIs' dtype and device so that
        # torch.cat does not depend on implicit dtype promotion or an extra copy
        batch_ind = torch.arange(B, dtype=rois_abs.dtype, device=rois_abs.device)
        batch_ind = batch_ind.view(B, 1, 1).expand(B, N, 1)
        rois_abs = torch.cat((batch_ind, rois_abs), dim=2)
        # NOTE: stop gradient here to stablize training
        return rois_abs.view((B*N, 3)).detach()

    def forward(self, samples):
        """ The forward expects a NestedTensor, which consists of:
               - samples.tensors: batched video features, of shape [batch_size x C x T]
               - samples.mask: a binary mask of shape [batch_size x T], containing 1 on padded steps
            or a tuple/list (tensors, mask), or a list of tensors to be batched.

            It returns a dict with the following elements:
               - "pred_logits": the classification logits for all queries (output of the
                                last decoder layer). Shape= [batch_size x num_queries x num_classes]
               - "pred_segments": The normalized segments coordinates for all queries, represented as
                               (center, width). These values are produced by a sigmoid, i.e. lie in [0, 1].
                               See PostProcess for information on how to retrieve the unnormalized segment.
               - "pred_actionness": only when with_act_reg is enabled. Actionness score of every
                                predicted segment. Shape= [batch_size x num_queries x 1]
               - "aux_outputs": Optional, only returned when auxilary losses are activated. It is a list of
                                dictionnaries containing the first two keys for each earlier decoder layer.
        """
        if not isinstance(samples, NestedTensor):
            if isinstance(samples, (list, tuple)):
                samples = NestedTensor(*samples)
            else:
                samples = nested_tensor_from_tensor_list(samples)  # (n, c, t)

        pos = [self.position_embedding(samples)]
        src, mask = samples.tensors, samples.mask
        srcs = [self.input_proj[0](src)]
        masks = [mask]

        query_embeds = self.query_embed.weight
        hs, init_reference, inter_references, memory = self.transformer(
            srcs, masks, pos, query_embeds)

        outputs_classes = []
        outputs_coords = []
        # gather outputs from each decoder layer
        for lvl in range(hs.shape[0]):
            # layer `lvl` refines the reference it received as input
            reference = init_reference if lvl == 0 else inter_references[lvl - 1]

            # predictions are offsets in logit space relative to the reference
            reference = inverse_sigmoid(reference)
            outputs_class = self.class_embed[lvl](hs[lvl])
            tmp = self.segment_embed[lvl](hs[lvl])
            if reference.shape[-1] == 2:
                # reference is a full (center, width) segment
                tmp += reference
            else:
                # reference is a point: only the center is offset
                assert reference.shape[-1] == 1
                tmp[..., 0] += reference[..., 0]
            outputs_coord = tmp.sigmoid()
            outputs_classes.append(outputs_class)
            outputs_coords.append(outputs_coord)
        outputs_class = torch.stack(outputs_classes)
        outputs_coord = torch.stack(outputs_coords)

        out = {'pred_logits': outputs_class[-1],
               'pred_segments': outputs_coord[-1]}

        if self.with_act_reg:
            # perform RoIAlign on the encoder memory with the final predicted segments
            B, N = outputs_coord[-1].shape[:2]
            origin_feat = memory

            # RoIs are enlarged by 1.5x before feature extraction
            rois = self._to_roi_align_format(
                outputs_coord[-1], origin_feat.shape[2], scale_factor=1.5)
            roi_features = self.roi_extractor(origin_feat, rois)
            roi_features = roi_features.view((B, N, -1))
            out['pred_actionness'] = self.actionness_pred(roi_features)

        if self.aux_loss:
            out['aux_outputs'] = self._set_aux_loss(
                outputs_class, outputs_coord)

        return out

    @torch.jit.unused
    def _set_aux_loss(self, outputs_class, outputs_coord):
        # this is a workaround to make torchscript happy, as torchscript
        # doesn't support dictionary with non-homogeneous values, such
        # as a dict having both a Tensor and a list.
        # The last layer is excluded: it is reported at the top level of the output dict.
        return [{'pred_logits': a, 'pred_segments': b}
                for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]


class SetCriterion(nn.Module):
    """ This class computes the loss for TadTR.
    The process happens in two steps:
        1) we compute hungarian assignment between ground truth segments and the outputs of the model
        2) we supervise each pair of matched ground-truth / prediction (supervise class and segment)
    After `forward`, `self.indices` holds the matching computed for the final decoder layer.
    """

    def __init__(self, num_classes, matcher, weight_dict, losses, focal_alpha=0.25):
        """ Create the criterion.
        Parameters:
            num_classes: number of action categories, omitting the special no-action category
            matcher: module able to compute a matching between targets and proposals
            weight_dict: dict containing as key the names of the losses and as values their relative weight.
            losses: list of all the losses to be applied. See get_loss for list of available losses.
            focal_alpha: alpha in Focal Loss
        """
        super().__init__()
        self.num_classes = num_classes
        self.matcher = matcher
        self.weight_dict = weight_dict
        self.losses = losses
        self.focal_alpha = focal_alpha

    def loss_labels(self, outputs, targets, indices, num_segments, log=True):
        """Classification loss (sigmoid focal loss)
        targets dicts must contain the key "labels" containing a tensor of dim [nb_target_segments]
        When `log` is True the classification error of the matched queries is also
        reported under 'class_error'.
        """
        assert 'pred_logits' in outputs
        src_logits = outputs['pred_logits']

        idx = self._get_src_permutation_idx(indices)
        target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)])
        # unmatched queries get the extra index `num_classes` (no-action)
        target_classes = torch.full(src_logits.shape[:2], self.num_classes,
                                    dtype=torch.int64, device=src_logits.device)
        target_classes[idx] = target_classes_o

        # one-hot with one extra column for no-action, which is dropped afterwards so
        # that unmatched queries end up with an all-zero target
        target_classes_onehot = torch.zeros([src_logits.shape[0], src_logits.shape[1], src_logits.shape[2] + 1],
                                            dtype=src_logits.dtype, layout=src_logits.layout, device=src_logits.device)
        target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1)
        target_classes_onehot = target_classes_onehot[:, :, :-1]

        # the focal loss is rescaled by the number of queries
        loss_ce = sigmoid_focal_loss(src_logits, target_classes_onehot, num_segments, alpha=self.focal_alpha, gamma=2) * src_logits.shape[1]  # nq
        losses = {'loss_ce': loss_ce}

        if log:
            # TODO this should probably be a separate loss, not hacked in this one here
            losses['class_error'] = 100 - accuracy(src_logits[idx], target_classes_o)[0]

        return losses

    def loss_segments(self, outputs, targets, indices, num_segments):
        """Compute the losses related to the segments, the L1 regression loss and the IoU loss
           targets dicts must contain the key "segments" containing a tensor of dim [nb_target_segments, 2]
           The target segments are expected in format (center, width), normalized by the video length.
           Both losses are summed over the matched pairs and divided by num_segments.
        """
        assert 'pred_segments' in outputs
        idx = self._get_src_permutation_idx(indices)
        src_segments = outputs['pred_segments'][idx]
        target_segments = torch.cat([t['segments'][i] for t, (_, i) in zip(targets, indices)], dim=0)

        loss_segment = F.l1_loss(src_segments, target_segments, reduction='none')

        losses = {}
        losses['loss_segments'] = loss_segment.sum() / num_segments

        # the diagonal of the pairwise IoU matrix holds the IoU of each matched pair
        loss_iou = 1 - torch.diag(segment_ops.segment_iou(
            segment_ops.segment_cw_to_t1t2(src_segments),
            segment_ops.segment_cw_to_t1t2(target_segments)))
        losses['loss_iou'] = loss_iou.sum() / num_segments
        return losses

    def loss_actionness(self, outputs, targets, indices, num_segments):
        """Compute the actionness regression loss
           targets dicts must contain the key "segments" containing a tensor of dim [nb_target_segments, 2]
           The target segments are expected in format (center, width), normalized by the video length.

           The regression target of every predicted segment is its maximum IoU with the
           ground-truth segments of the SAME video (0 when that video has none). The
           matching `indices` and `num_segments` are not used.
        """
        assert 'pred_segments' in outputs
        assert 'pred_actionness' in outputs
        pred_segments = outputs['pred_segments']    # (bs, nq, 2)

        # IoUs are computed video by video: segments are normalized per video, so
        # comparing a prediction against another video's ground truth is meaningless.
        gt_ious = []
        for video_segments, target in zip(pred_segments, targets):
            gt_segments = target['segments']
            if gt_segments.shape[0] == 0:
                # no ground truth in this video: every prediction has IoU 0
                gt_ious.append(video_segments.new_zeros(video_segments.shape[0]))
                continue
            iou_mat = segment_ops.segment_iou(
                segment_ops.segment_cw_to_t1t2(video_segments),
                segment_ops.segment_cw_to_t1t2(gt_segments))
            gt_ious.append(iou_mat.max(dim=1)[0])
        gt_iou = torch.cat(gt_ious, dim=0)

        pred_actionness = outputs['pred_actionness']
        loss_actionness = F.l1_loss(pred_actionness.view(-1), gt_iou.view(-1).detach())

        return {'loss_actionness': loss_actionness}

    def _get_src_permutation_idx(self, indices):
        # permute predictions following indices
        batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
        src_idx = torch.cat([src for (src, _) in indices])
        return batch_idx, src_idx

    def _get_tgt_permutation_idx(self, indices):
        # permute targets following indices
        batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
        tgt_idx = torch.cat([tgt for (_, tgt) in indices])
        return batch_idx, tgt_idx

    def get_loss(self, loss, outputs, targets, indices, num_segments, **kwargs):
        """Dispatch to the loss function registered under the name `loss`
        ('labels', 'segments' or 'actionness')."""
        loss_map = {
            'labels': self.loss_labels,
            'segments': self.loss_segments,
            'actionness': self.loss_actionness,
        }

        assert loss in loss_map, f'do you really want to compute {loss} loss?'
        return loss_map[loss](outputs, targets, indices, num_segments, **kwargs)

    def forward(self, outputs, targets):
        """ This performs the loss computation.
        Parameters:
             outputs: dict of tensors, see the output specification of the model for the format
             targets: list of dicts, such that len(targets) == batch_size.
                      The expected keys in each dict depends on the losses applied, see each loss' doc
        Returns:
             dict mapping loss names to values; losses of the i-th auxiliary output carry
             the suffix '_i'.
        """
        outputs_without_aux = {k: v for k, v in outputs.items() if k != 'aux_outputs'}

        # Retrieve the matching between the outputs of the last layer and the targets
        indices = self.matcher(outputs_without_aux, targets)

        # Compute the average number of target segments accross all nodes, for normalization purposes
        num_segments = sum(len(t["labels"]) for t in targets)
        num_segments = torch.as_tensor([num_segments], dtype=torch.float, device=next(iter(outputs.values())).device)
        if is_dist_avail_and_initialized():
            torch.distributed.all_reduce(num_segments)
        num_segments = torch.clamp(num_segments / get_world_size(), min=1).item()

        # Compute all the requested losses
        losses = {}
        for loss in self.losses:
            losses.update(self.get_loss(loss, outputs, targets, indices, num_segments))

        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
        if 'aux_outputs' in outputs:
            for i, aux_outputs in enumerate(outputs['aux_outputs']):
                # a separate name keeps the final-layer matching intact for self.indices
                aux_indices = self.matcher(aux_outputs, targets)
                for loss in self.losses:
                    # we do not compute actionness loss for aux outputs
                    if 'actionness' in loss:
                        continue

                    kwargs = {}
                    if loss == 'labels':
                        # Logging is enabled only for the last layer
                        kwargs['log'] = False
                    l_dict = self.get_loss(loss, aux_outputs, targets, aux_indices, num_segments, **kwargs)
                    l_dict = {k + f'_{i}': v for k, v in l_dict.items()}
                    losses.update(l_dict)

        # expose the matching of the final decoder layer
        self.indices = indices
        return losses


class PostProcess(nn.Module):
    """ This module converts the model's output into the format expected by the TADEvaluator"""

    @torch.no_grad()
    def forward(self, outputs, target_sizes, fuse_score=True):
        """ Perform the computation
        Parameters:
            outputs: raw outputs of the model
            target_sizes: tensor of dimension [batch_size] containing the duration of each video of the batch
            fuse_score: multiply the class probabilities by outputs['pred_actionness']
                (the key must then be present in `outputs`)
        Returns:
            one dict per video with the keys 'scores', 'labels', 'segments' (start, end in
            the unit of `target_sizes`) and 'query_ids' (index of the query that produced
            each detection). All four have the same number of detections.
        """
        out_logits, out_segments = outputs['pred_logits'], outputs['pred_segments']

        assert len(out_logits) == len(target_sizes)

        prob = out_logits.sigmoid()   # [bs, nq, C]
        if fuse_score:
            prob *= outputs['pred_actionness']

        segments = segment_ops.segment_cw_to_t1t2(out_segments)   # bs, nq, 2
        batch_size, num_queries, num_classes = prob.shape

        if cfg.postproc_rank == 1:     # default
            # rank all (query, class) pairs of a video jointly and keep at most
            # cfg.postproc_ins_topk of them
            topk_values, topk_indexes = torch.topk(
                prob.view(batch_size, -1),
                min(cfg.postproc_ins_topk, num_queries * num_classes), dim=1)
            scores = topk_values
            # flat index = query * num_classes + class
            topk_segments = topk_indexes // num_classes
            labels = topk_indexes % num_classes

            # bs, nq, 2; bs, num, 2
            segments = torch.gather(
                segments, 1, topk_segments.unsqueeze(-1).repeat(1, 1, 2))
            query_ids = topk_segments
        else:
            # pick topk classes for each query
            cls_topk = cfg.postproc_cls_topk
            scores, labels = torch.topk(prob, cls_topk, dim=-1)
            scores, labels = scores.flatten(1), labels.flatten(1)   # bs, nq*k
            # query index of every flattened (query, rank) entry: 0,..,0,1,..,1,...
            # It is built from the original number of queries; deriving it from the
            # already-expanded `segments` would yield nq*k*k entries.
            query_index = torch.arange(
                num_queries, dtype=labels.dtype, device=labels.device).repeat_interleave(cls_topk)
            # repeat every segment once per selected class: bs, nq*k, 2
            segments = segments[:, query_index, :]
            query_ids = query_index[None, :].repeat(batch_size, 1)

        # from normalized [0, 1] to absolute [0, length] coordinates
        vid_length = target_sizes
        scale_fct = torch.stack([vid_length, vid_length], dim=1)
        segments = segments * scale_fct[:, None, :]

        results = [{'scores': s, 'labels': l, 'segments': b, 'query_ids': q}
                   for s, l, b, q in zip(scores, labels, segments, query_ids)]

        return results


class MLP(nn.Module):
    """ Plain multi-layer perceptron (FFN): a stack of Linear layers with a ReLU
    after every layer except the last one."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        hidden_widths = [hidden_dim] * (num_layers - 1)
        in_widths = [input_dim] + hidden_widths
        out_widths = hidden_widths + [output_dim]
        self.layers = nn.ModuleList(
            nn.Linear(n_in, n_out) for n_in, n_out in zip(in_widths, out_widths))

    def forward(self, x):
        last = self.num_layers - 1
        for index, layer in enumerate(self.layers):
            x = layer(x)
            # no activation on the output layer
            if index < last:
                x = F.relu(x)
        return x


def build(args):
    """Assemble the model, its training criterion and the post-processor from `args`.

    Returns:
        (model, criterion, postprocessor): a TadTR module, a SetCriterion and a
        PostProcess instance.
    Raises:
        ValueError: if `args.binary` is false and `args.dataset_name` is not one of
        the known datasets.
    """
    if args.binary:
        # class-agnostic detection
        num_classes = 1
    else:
        classes_per_dataset = {
            'thumos14': 20,
            'muses': 25,
            'activitynet': 200,
            'hacs': 200,
        }
        if args.dataset_name not in classes_per_dataset:
            raise ValueError('unknown dataset {}'.format(args.dataset_name))
        num_classes = classes_per_dataset[args.dataset_name]

    pos_embed = build_position_encoding(args)
    transformer = build_deformable_transformer(args)
    model = TadTR(
        pos_embed,
        transformer,
        num_classes=num_classes,
        num_queries=args.num_queries,
        aux_loss=args.aux_loss,
        with_segment_refine=args.seg_refine,
        with_act_reg=args.act_reg
    )
    matcher = build_matcher(args)

    # losses and their weights for the final decoder layer
    losses = ['labels', 'segments']
    weight_dict = {
        'loss_ce': args.cls_loss_coef,
        'loss_segments': args.seg_loss_coef,
        'loss_iou': args.iou_loss_coef,
    }
    if args.act_reg:
        weight_dict['loss_actionness'] = args.act_loss_coef
        losses.append('actionness')

    if args.aux_loss:
        # replicate every weight for each auxiliary decoder layer ('_0', '_1', ...)
        # and for the '_enc' suffix
        suffixes = [f'_{i}' for i in range(args.dec_layers - 1)] + ['_enc']
        aux_weight_dict = {
            name + suffix: weight
            for suffix in suffixes
            for name, weight in weight_dict.items()
        }
        weight_dict.update(aux_weight_dict)

    criterion = SetCriterion(num_classes, matcher,
                             weight_dict, losses, focal_alpha=args.focal_alpha)
    postprocessor = PostProcess()

    return model, criterion, postprocessor


================================================
FILE: models/transformer.py
================================================
# ------------------------------------------------------------------------
# TadTR: End-to-end Temporal Action Detection with Transformer
# Copyright (c) 2021. Xiaolong Liu.
# ------------------------------------------------------------------------
# Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR)
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0
# ------------------------------------------------------------------------
# and DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------

import copy

import torch
import torch.nn.functional as F
from torch import nn, Tensor
from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_

from util.misc import inverse_sigmoid
from models.ops.temporal_deform_attn import DeformAttn
from opts import cfg


class DeformableTransformer(nn.Module):
    """Deformable transformer over temporal feature sequences: a deformable
    encoder over the (multi-level) input and a decoder driven by learned queries
    whose initial reference points are predicted from the query embeddings."""

    def __init__(self, d_model=256, nhead=8,
                 num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=1024, dropout=0.1,
                 activation="relu", return_intermediate_dec=False,
                 num_feature_levels=4, dec_n_points=4,  enc_n_points=4):
        super().__init__()

        self.d_model = d_model
        self.nhead = nhead

        # encoder: a prototype layer that is replicated num_encoder_layers times
        enc_layer = DeformableTransformerEncoderLayer(
            d_model, dim_feedforward, dropout, activation,
            num_feature_levels, nhead, enc_n_points)
        self.encoder = DeformableTransformerEncoder(enc_layer, num_encoder_layers)

        # decoder: same scheme
        dec_layer = DeformableTransformerDecoderLayer(
            d_model, dim_feedforward, dropout, activation,
            num_feature_levels, nhead, dec_n_points)
        self.decoder = DeformableTransformerDecoder(dec_layer, num_decoder_layers, return_intermediate_dec)

        # one learned embedding per feature level, added to the positional encoding
        self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model))

        # predicts a 1-d reference point (before sigmoid) from each query embedding
        self.reference_points = nn.Linear(d_model, 1)

        self._reset_parameters()

    def _reset_parameters(self):
        # Xavier init for every weight matrix ...
        for param in self.parameters():
            if param.dim() > 1:
                nn.init.xavier_uniform_(param)
        # ... then let the deformable attention modules apply their own scheme
        for module in self.modules():
            if isinstance(module, DeformAttn):
                module._reset_parameters()

        xavier_uniform_(self.reference_points.weight.data, gain=1.0)
        constant_(self.reference_points.bias.data, 0.)
        normal_(self.level_embed)

    def get_valid_ratio(self, mask):
        """Fraction of non-padded steps for every sequence of a (bs, T) padding mask."""
        _, total_len = mask.shape
        valid_len = (~mask).sum(1)
        return valid_len.float() / total_len    # shape=(bs)

    def forward(self, srcs, masks, pos_embeds, query_embed=None):
        '''
        Params:
            srcs: list of Tensor with shape (bs, c, t)
            masks: list of Tensor with shape (bs, t)
            pos_embeds: list of Tensor with shape (bs, c, t)
            query_embed: Tensor with shape (nq, 2c)
        Returns:
            hs: per layer output of decoder
            init_reference_out: reference points predicted from query embeddings
            inter_references_out: reference points predicted from each decoder layer
            memory: (bs, c, t), final output of the encoder
        '''
        assert query_embed is not None

        # prepare input for encoder: bring every level to (bs, t, c) and remember its length
        level_srcs, level_masks, level_pos, level_lens = [], [], [], []
        for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)):
            _, _, length = src.shape
            level_lens.append(length)
            level_pos.append(pos_embed.transpose(1, 2) + self.level_embed[lvl].view(1, 1, -1))
            level_srcs.append(src.transpose(1, 2))
            level_masks.append(mask)

        # concatenate all levels along the temporal axis
        src_flatten = torch.cat(level_srcs, 1)
        mask_flatten = torch.cat(level_masks, 1)
        lvl_pos_embed_flatten = torch.cat(level_pos, 1)
        temporal_lens = torch.as_tensor(level_lens, dtype=torch.long, device=src_flatten.device)
        # offset of every level inside the flattened sequence
        level_start_index = torch.cat((temporal_lens.new_zeros((1, )), temporal_lens.cumsum(0)[:-1]))
        valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)   # (bs, nlevels)

        # deformable encoder
        encoder_pos = lvl_pos_embed_flatten if cfg.use_pos_embed else None
        memory = self.encoder(src_flatten, temporal_lens, level_start_index, valid_ratios,
                              encoder_pos, mask_flatten)  # shape=(bs, t, c)

        bs, _, c = memory.shape

        # first half of the embedding is the query positional part, second half the initial target
        query_embed, tgt = torch.split(query_embed, c, dim=1)
        query_embed = query_embed.unsqueeze(0).expand(bs, -1, -1)
        tgt = tgt.unsqueeze(0).expand(bs, -1, -1)
        reference_points = self.reference_points(query_embed).sigmoid()

        # decoder
        hs, inter_references = self.decoder(tgt, reference_points, memory,
                                            temporal_lens, level_start_index, valid_ratios, query_embed, mask_flatten)
        return hs, reference_points, inter_references, memory.transpose(1, 2)


class DeformableTransformerEncoderLayer(nn.Module):
    """A single encoder layer: deformable self-attention followed by an FFN.

    Each sub-layer is wrapped as ``norm(x + dropout(sublayer(x)))``.
    """

    def __init__(self,
                 d_model=256, d_ffn=1024,
                 dropout=0.1, activation="relu",
                 n_levels=4, n_heads=8, n_points=4):
        super().__init__()

        # NOTE: sub-modules are created in this exact order on purpose, so that
        # parameter registration (and seeded initialisation) is unaffected.

        # deformable self-attention sub-layer
        self.self_attn = DeformAttn(d_model, n_levels, n_heads, n_points)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_model)

        # feed-forward sub-layer
        self.linear1 = nn.Linear(d_model, d_ffn)
        self.activation = _get_activation_fn(activation)
        self.dropout2 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ffn, d_model)
        self.dropout3 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(d_model)

    @staticmethod
    def with_pos_embed(tensor, pos):
        """Add the positional embedding `pos` to `tensor`; no-op when `pos` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_ffn(self, src):
        """Apply linear -> activation -> dropout -> linear with residual and LayerNorm."""
        hidden = self.activation(self.linear1(src))
        ffn_out = self.linear2(self.dropout2(hidden))
        return self.norm2(src + self.dropout3(ffn_out))

    def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, padding_mask=None):
        """Run self-attention and the FFN on `src` and return the updated features."""
        # The positional embedding is added to the query only; the attended
        # values are the raw `src` features.
        query = self.with_pos_embed(src, pos)
        attn_out, _ = self.self_attn(query, reference_points, src, spatial_shapes, level_start_index, padding_mask)
        src = self.norm1(src + self.dropout1(attn_out))

        return self.forward_ffn(src)


class DeformableTransformerEncoder(nn.Module):
    """Stack of `num_layers` deep copies of `encoder_layer`, applied in sequence."""

    def __init__(self, encoder_layer, num_layers):
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers

    @staticmethod
    def get_reference_points(spatial_shapes, valid_ratios, device):
        """Build the reference point of every temporal position for every level.

        spatial_shapes: iterable with the temporal length of each level.
        valid_ratios: shape=(bs, n_levels).
        Returns a tensor of shape (bs, sum(t), n_levels, 1).
        """
        per_level = []
        for lvl, T_ in enumerate(spatial_shapes):
            # centres of the T_ temporal bins: 0.5, 1.5, ..., T_ - 0.5      (t,)
            centers = torch.linspace(0.5, T_ - 0.5, T_, dtype=torch.float32, device=device)
            # normalise by the valid (non-padded) length of this level     (bs, t)
            per_level.append(centers[None] / (valid_ratios[:, None, lvl] * T_))
        flat = torch.cat(per_level, 1)                                   # (bs, sum(t))
        scaled = flat[:, :, None] * valid_ratios[:, None]                # (bs, sum(t), n_levels)
        return scaled[..., None]                                         # (bs, sum(t), n_levels, 1)

    def forward(self, src, temporal_lens, level_start_index, valid_ratios, pos=None, padding_mask=None):
        '''
        src: shape=(bs, t, c)
        temporal_lens: shape=(n_levels). content: [t1, t2, t3, ...]
        level_start_index: shape=(n_levels,). [0, t1, t1+t2, ...]
        valid_ratios: shape=(bs, n_levels).
        '''
        # (bs, t, levels, 1); shared by all layers
        reference_points = self.get_reference_points(temporal_lens, valid_ratios, device=src.device)
        output = src
        for layer in self.layers:
            output = layer(output, pos, reference_points, temporal_lens, level_start_index, padding_mask)
        return output


class DeformableTransformerDecoderLayer(nn.Module):
    """A single decoder layer.

    Order of sub-layers in `forward`: (optional) self-attention between the
    queries, deformable cross-attention to the encoder memory, then an FFN.
    Each sub-layer is wrapped as ``norm(x + dropout(sublayer(x)))``.
    """

    def __init__(self, d_model=256, d_ffn=1024,
                 dropout=0.1, activation="relu",
                 n_levels=4, n_heads=8, n_points=4):
        super().__init__()

        # NOTE: sub-modules are created in this exact order on purpose, so that
        # parameter registration (and seeded initialisation) is unaffected.

        # deformable cross-attention sub-layer
        self.cross_attn = DeformAttn(d_model, n_levels, n_heads, n_points)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(d_model)

        # query self-attention sub-layer
        self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(d_model)

        # feed-forward sub-layer
        self.linear1 = nn.Linear(d_model, d_ffn)
        self.activation = _get_activation_fn(activation)
        self.dropout3 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ffn, d_model)
        self.dropout4 = nn.Dropout(dropout)
        self.norm3 = nn.LayerNorm(d_model)

    @staticmethod
    def with_pos_embed(tensor, pos):
        """Add the positional embedding `pos` to `tensor`; no-op when `pos` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_ffn(self, tgt):
        """Apply linear -> activation -> dropout -> linear with residual and LayerNorm."""
        hidden = self.activation(self.linear1(tgt))
        ffn_out = self.linear2(self.dropout3(hidden))
        return self.norm3(tgt + self.dropout4(ffn_out))

    def forward(self, tgt, query_pos, reference_points, src, src_spatial_shapes, level_start_index, src_padding_mask=None):
        """Update the query features `tgt` by attending to the other queries and to `src`."""
        if not cfg.disable_query_self_att:
            # self attention between queries; nn.MultiheadAttention is called
            # with the first two axes swapped, hence the transposes.
            q = k = self.with_pos_embed(tgt, query_pos)
            attended = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), tgt.transpose(0, 1))[0]
            tgt = self.norm2(tgt + self.dropout2(attended.transpose(0, 1)))

        # cross attention from the queries to the encoder memory
        cross_out, _ = self.cross_attn(self.with_pos_embed(tgt, query_pos),
                                       reference_points,
                                       src, src_spatial_shapes, level_start_index, src_padding_mask)
        tgt = self.norm1(tgt + self.dropout1(cross_out))

        # ffn
        return self.forward_ffn(tgt)


class DeformableTransformerDecoder(nn.Module):
    """Stack of decoder layers with optional per-layer reference refinement."""

    def __init__(self, decoder_layer, num_layers, return_intermediate=False):
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.return_intermediate = return_intermediate
        # hack implementation for iterative bounding box refinement and two-stage Deformable DETR
        # Both stay None here; the refinement branch in `forward` only runs
        # when `segment_embed` has been set to something else.
        self.segment_embed = None
        self.class_embed = None

    def forward(self, tgt, reference_points, src, src_spatial_shapes, src_level_start_index, src_valid_ratios,
                query_pos=None, src_padding_mask=None):
        '''
        tgt: [bs, nq, C]
        reference_points: [bs, nq, 1 or 2]
        src: [bs, T, C]
        src_valid_ratios: [bs, levels]

        Returns (output, reference_points) of the last layer, or the per-layer
        stacks of both when `return_intermediate` is set.
        '''
        output = tgt
        layer_outputs = []
        layer_references = []
        for lid, layer in enumerate(self.layers):
            # (bs, nq, 1, 1 or 2) x (bs, 1, num_level, 1) => (bs, nq, num_level, 1 or 2)
            scaled_refs = reference_points[:, :, None] * src_valid_ratios[:, None, :, None]
            output = layer(output, query_pos, scaled_refs, src,
                           src_spatial_shapes, src_level_start_index, src_padding_mask)

            # hack implementation for segment refinement
            if self.segment_embed is not None:
                # the current layer's output predicts an update for the
                # reference point/segment used by the next layer
                delta = self.segment_embed[lid](output)
                if reference_points.shape[-1] == 2:
                    refined = delta + inverse_sigmoid(reference_points)
                else:
                    # at the 0-th decoder layer
                    # d^(n+1) = delta_d^(n+1)
                    # c^(n+1) = sigmoid( inverse_sigmoid(c^(n)) + delta_c^(n+1))
                    assert reference_points.shape[-1] == 1
                    refined = delta
                    # written in place into the prediction tensor
                    refined[..., :1] = delta[..., :1] + inverse_sigmoid(reference_points)
                # detach so no gradient flows through the refined reference
                reference_points = refined.sigmoid().detach()

            if self.return_intermediate:
                layer_outputs.append(output)
                layer_references.append(reference_points)

        if not self.return_intermediate:
            return output, reference_points
        return torch.stack(layer_outputs), torch.stack(layer_references)


def _get_clones(module, N):
    """Return an nn.ModuleList holding N independent deep copies of `module`."""
    clones = []
    for _ in range(N):
        clones.append(copy.deepcopy(module))
    return nn.ModuleList(clones)


def _get_activation_fn(activation):
    """Return an activation function given a string.

    Supported names: "relu", "gelu", "glu" and "leaky_relu".

    Raises:
        RuntimeError: if `activation` is not one of the supported names.
    """
    supported = {
        "relu": F.relu,
        "gelu": F.gelu,
        "glu": F.glu,
        "leaky_relu": F.leaky_relu,
    }
    try:
        return supported[activation]
    except (KeyError, TypeError):
        # TypeError covers unhashable inputs, which the original chain of
        # equality tests also rejected with RuntimeError.
        # The message lists every accepted name (it used to mention only relu/gelu).
        raise RuntimeError(
            F"activation should be relu/gelu/glu/leaky_relu, not {activation}.") from None


def build_deformable_transformer(args):
    """Construct a DeformableTransformer from the option object `args`.

    Intermediate decoder outputs are always returned and a single feature
    level is used; everything else is read from `args`.
    """
    transformer_kwargs = dict(
        d_model=args.hidden_dim,
        nhead=args.nheads,
        num_encoder_layers=args.enc_layers,
        num_decoder_layers=args.dec_layers,
        dim_feedforward=args.dim_feedforward,
        dropout=args.dropout,
        activation=args.activation,
        return_intermediate_dec=True,
        num_feature_levels=1,
        dec_n_points=args.dec_n_points,
        enc_n_points=args.enc_n_points,
    )
    return DeformableTransformer(**transformer_kwargs)


================================================
FILE: opts.py
================================================
# ------------------------------------------------------------------------
# TadTR: End-to-end Temporal Action Detection with Transformer
# Copyright (c) 2021 - 2022. Xiaolong Liu.
# ------------------------------------------------------------------------


import argparse
from easydict import EasyDict
import yaml


def str2bool(x):
    """Return True if `x` (case-insensitive) is one of 'true', 't', '1', 'y'; otherwise False."""
    truthy = ['true', 't', '1', 'y']
    return x.lower() in truthy


def get_args_parser():
    """Build the command-line parser for TadTR (created without the default -h/--help)."""
    parser = argparse.ArgumentParser('TadTR', add_help=False)

    # (flags, keyword arguments) in the order they are registered
    option_specs = [
        (('--cfg',), dict(type=str, help='the config file to use')),
        (('--device',), dict(default='cuda',
                             help='device to use for training / testing')),
        (('--seed',), dict(default=42, type=int)),
        (('--resume',), dict(default='', help='resume from checkpoint')),
        (('--eval',), dict(action='store_true', help='perform testing')),
        (('--num_workers',), dict(default=2, type=int,
                                  help='number of dataloader workers')),
        # Multi-GPU training
        # We support both DataParallel and Distributed DataParallel (DDP)
        (('--multi_gpu',), dict(action='store_true', help='use nn.DataParallel')),
        (('--world_size',), dict(default=1, type=int,
                                 help='number of distributed processes')),
        (('--dist_url',), dict(default='env://',
                               help='url used to set up distributed training')),
        # Other options: everything left on the command line is collected here
        (('opt',), dict(nargs=argparse.REMAINDER,
                        help='Command arguments that override configs')),
    ]
    for flags, kwargs in option_specs:
        parser.add_argument(*flags, **kwargs)
    return parser


# Global default configuration. The two update_cfg_* helpers in this module
# overwrite these values in place.
cfg = EasyDict()
# ---- Basic options ----
# Whether to enable tensorboard
cfg.tensorboard = False
# Disable CUDA extensions so that we can run the model on CPU
cfg.disable_cuda = False
# The backend of deformable attention: 'pytorch' or 'CUDA'
cfg.dfm_att_backend = 'pytorch'

# Path where outputs are saved; empty for no saving
cfg.output_dir = ''


# ---- Data options ----
cfg.dataset_name = 'thumos14'
# Use feature input or raw image input (jointly train the video encoder and the detection head). Choices: {feature, image}
cfg.input_type = 'feature'
# Which kind of feature to use, e.g. i3d, tsn.
cfg.feature = 'i3d2s'
# Dimension (number of channels) of the video feature
cfg.feature_dim = 2048
# Perform binary detection (proposal generation) only
cfg.binary = False
# Which subset to test on, 'val' or 'test' (for ANet and HACS). Note that we rename the training/validation/testing subsets for all datasets. For example, the validation subset used for training on THUMOS14 is renamed as the 'train' subset.
cfg.test_set = 'val'
# Whether to crop videos into windows (a window is also called a slice in this codebase). Required for THUMOS14
cfg.online_slice = False
# Length of video slices. For feature input, this is the length of the feature sequence. For video input, it is the length of the frame sequence.
cfg.slice_len = None
# Overlap ratio (= overlap_length / slice_length) between adjacent slices during training
cfg.slice_overlap = 0
# Overlap ratio between adjacent slices during inference
cfg.test_slice_overlap = 0


# ---- Model options ----
# Name of the convolutional backbone to use. If video features are used as input, backbone should be 'none'
cfg.backbone = 'none'

# Whether to use position embedding
cfg.use_pos_embed = True
# Type of positional embedding to use on top of the video features. Only sine embedding is supported.
cfg.position_embedding = "sine"

# Number of encoding layers in the transformer
cfg.enc_layers = 2
# Number of decoding layers in the transformer
cfg.dec_layers = 4
# Intermediate size of the feedforward layers in the transformer blocks
cfg.dim_feedforward = 2048
# Size of the embeddings (dimension of the transformer)
cfg.hidden_dim = 256
# Dropout applied in the transformer
cfg.dropout = 0.1
# Number of attention heads inside the transformer's attentions
cfg.nheads = 8
# Number of sampled points per head for deformable attention in the encoder
cfg.enc_n_points = 4
# Number of sampled points per head for deformable attention in the decoder
cfg.dec_n_points = 4
# Number of action queries
cfg.num_queries = 30
# Transformer activation type, relu|leaky_relu|gelu
cfg.activation = 'relu'
# Whether to enable the segment refinement mechanism
cfg.seg_refine = True
# Whether to enable the actionness regression head
cfg.act_reg = True
# Whether to disable self-attention between action queries
cfg.disable_query_self_att = False


# ---- Loss and matcher settings ----
# Enable auxiliary decoding losses (loss at each layer)
cfg.aux_loss = True

# Loss weights
cfg.act_loss_coef = 4
cfg.cls_loss_coef = 2
cfg.seg_loss_coef = 5
cfg.iou_loss_coef = 2
# Relative classification weight of the no-action class
cfg.eos_coef = 0.1
# For focal loss
cfg.focal_alpha = 0.25

# Matching cost weights
cfg.set_cost_class = 6    # Class coefficient 
cfg.set_cost_seg = 5      # Segment L1 coefficient 
cfg.set_cost_iou = 2      # Segment IoU coefficient


# ---- Training options ----
# Base learning rate. If you set lr in a yaml file, do not use scientific notation (2e-4); write 0.0002 instead
cfg.lr = 2e-4

# Valid only when the input is video frames
# Name patterns of the backbone layers.
cfg.lr_backbone_names = ['backbone']
# Learning rate of the backbone layers
cfg.lr_backbone = 1e-5

# Special linear projection layers that need a smaller lr
cfg.lr_linear_proj_names = ['reference_points', 'sampling_offsets']
cfg.lr_linear_proj_mult = 0.1

# Which optimizer to use, choose from ['AdamW', 'Adam', 'SGD']
cfg.optimizer = 'AdamW'
cfg.batch_size = 16
cfg.weight_decay = 1e-4
# Gradient clipping max norm
cfg.clip_max_norm = 0.1

# Maximum number of training epochs
cfg.epochs = 16

# Epochs at which to decay lr
cfg.lr_step = [14]
# Save a checkpoint every N epochs. Set it to a small value if you want to save intermediate models
cfg.ckpt_interval = 10
# Update parameters every N forward-backward passes. N=1 (default)
cfg.iter_size = 1
# Test the model every N epochs. N=1 (default)
cfg.test_interval = 1


# ---- Post-processing options ----
# How to rank the predicted instances.
# 1: for each query, generate an instance for each class; then pick the top-scored instances from the whole set
# 2: pick the top classes for each query
cfg.postproc_rank = 1
# For each query, pick the top k classes; keep all queries.
# This setting is useful for debugging
cfg.postproc_cls_topk = 1
# For each video, pick the top k detections
cfg.postproc_ins_topk = 100
# IoU threshold for NMS. Note that NMS is not necessary.
cfg.nms_thr = 0.4


def update_cfg_with_args(cfg, arg_list):
    """Override entries of `cfg` in place from a flat ``[key, value, ...]`` list.

    Args:
        cfg: nested mapping to update. Dotted keys such as ``"a.b.c"`` walk
            into existing sub-mappings before the last part is assigned.
        arg_list: alternating keys and string values. Each value is parsed
            with `ast.literal_eval`; a value that is not a Python literal is
            stored as the raw string.

    Raises:
        ValueError: if `arg_list` does not consist of complete key/value
            pairs. The check happens before anything is modified.
        KeyError: if an intermediate part of a dotted key is missing.
    """
    from ast import literal_eval

    # Validate up front so a dangling key cannot leave cfg half-updated.
    if len(arg_list) % 2 != 0:
        raise ValueError(
            'config overrides must be given as KEY VALUE pairs, got %r' % (list(arg_list),))

    for key, raw_value in zip(arg_list[0::2], arg_list[1::2]):
        cur_entry = cfg
        key_parts = key.split('.')
        for k in key_parts[:-1]:
            cur_entry = cur_entry[k]
        node = key_parts[-1]
        try:
            value = literal_eval(raw_value)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal (e.g. a bare word or a path): keep the string.
            # Only literal_eval's documented failures are caught, so things like
            # KeyboardInterrupt are no longer swallowed.
            value = raw_value
        cur_entry[node] = value


def update_cfg_from_file(cfg, cfg_path):
    """Load the YAML file at `cfg_path` and merge its top-level entries into `cfg`.

    Args:
        cfg: mapping updated in place via ``cfg.update``.
        cfg_path: path to a YAML config file.

    Raises:
        AssertionError: if `cfg_path` does not exist.
    """
    import os
    assert os.path.exists(cfg_path), 'cfg_path is invalid'
    # Use a context manager so the file handle is closed deterministically.
    # NOTE(review): yaml.FullLoader is meant for trusted config files only;
    # switch to yaml.safe_load if configs can come from untrusted sources.
    with open(cfg_path) as cfg_file:
        cfg_from_file = yaml.load(cfg_file, yaml.FullLoader)
    # An empty YAML document loads as None; there is nothing to merge then.
    if cfg_from_file:
        cfg.update(cfg_from_file)

================================================
FILE: requirements.txt
================================================
torch>=1.5.1
torchvision>=0.6.1
scipy
tqdm
easydict
PyYAML
numpy
pandas


================================================
FILE: scripts/run_parallel.sh
================================================
# Example launch commands for multi-GPU runs.
# NOTE(review): "CFG_PATH" looks like a placeholder for the path of a config
# file (main.py's --cfg option) - replace it before running.

# Run on two GPUs in non-distributed mode (more convenient)
# --multi_gpu makes main.py use nn.DataParallel.
CUDA_VISIBLE_DEVICES=0,1 python -u main.py --cfg "CFG_PATH" --multi_gpu

# Run on two GPUs in distributed mode (more powerful)
# One process is started per GPU (--nproc_per_node=2).
MASTER_PORT=29510
CUDA_VISIBLE_DEVICES=0,1 python -u -m torch.distributed.launch --nproc_per_node=2 --master_port ${MASTER_PORT} --use_env main.py --cfg "CFG_PATH"


================================================
FILE: scripts/test_reference_models.sh
================================================
# Evaluate the reference checkpoint of a dataset.
# Usage: pass the dataset name as the first argument; only "thumos14" is handled.
dataset=$1
   
if [[ $dataset = thumos14 ]];then

    # Run main.py in evaluation mode (--eval) on GPU 0, resuming from the reference checkpoint.
    CUDA_VISIBLE_DEVICES=0 python main.py --cfg configs/thumos14_i3d2s_tadtr.yml --eval --resume data/thumos14/thumos14_i3d2s_tadtr_reference.pth
else
    echo "Unsupported dataset ${dataset}. Exit"
fi


================================================
FILE: util/__init__.py
================================================
# ------------------------------------------------------------------------
# TadTR: End-to-end Temporal Action Detection with Transformer
# Copyright (c) 2021. Xiaolong Liu.
# ------------------------------------------------------------------------

================================================
FILE: util/logger.py
================================================
# ------------------------------------------------------------------------
# TadTR: End-to-end Temporal Action Detection with Transformer
# Copyright (c) 2021. Xiaolong Liu.
# ------------------------------------------------------------------------


import builtins
import logging
import sys
from .misc import is_main_process


def _suppress_print():
    """
    Silence the current process by replacing the built-in ``print`` with a
    function that accepts the same arguments and does nothing.
    """

    def _noop_print(*objects, sep=" ", end="\n", file=sys.stdout, flush=False):
        return None

    builtins.print = _noop_print


def setup_logger(log_file_path, name=None, level=logging.INFO):
    """
    Setup a logger that simultaneously output to a file and stdout
    ARGS
      log_file_path: string, path to the logging file. None disables the file handler.
      name: name of the logger to configure; None configures the root logger.
        A named logger has propagation to its ancestors turned off.
      level: logging level set on the logger.
    RETURNS
      the configured logger on the main process; None on every other process,
      where the built-in print is suppressed instead.
    """
    if not is_main_process():
        print('this is not a master process, suppress print')
        _suppress_print()
        return None

    print('this is master process, set up logger')
    root_logger = logging.getLogger(name)
    if name:
        root_logger.propagate = False
    root_logger.setLevel(level)

    # file handler: verbose format with source location
    if log_file_path is not None:
        file_formatter = logging.Formatter(
            "[%(asctime)s][%(levelname)s] %(pathname)s: %(lineno)4d: %(message)s",
            datefmt="%m/%d %H:%M:%S")
        log_file_handler = logging.FileHandler(log_file_path)
        log_file_handler.setFormatter(file_formatter)
        root_logger.addHandler(log_file_handler)

    # stdout handler: compact format
    stream_formatter = logging.Formatter(
        "[%(asctime)s][%(levelname)s]: %(message)s",
        datefmt="%m/%d %H:%M:%S")
    log_stream_handler = logging.StreamHandler(sys.stdout)
    log_stream_handler.setFormatter(stream_formatter)
    root_logger.addHandler(log_stream_handler)

    # Log through the logger that was just configured. The module-level
    # logging.info() always targets the root logger, so with a non-empty
    # `name` the message bypassed these handlers (and implicitly called
    # logging.basicConfig() on the root logger).
    root_logger.info('Log file is %s', log_file_path)
    return root_logger


================================================
FILE: util/misc.py
================================================
# ------------------------------------------------------------------------
# TadTR: End-to-end Temporal Action Detection with Transformer
# Copyright (c) 2021. Xiaolong Liu.
# ------------------------------------------------------------------------
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------

"""
Misc functions, including distributed helpers.

Mostly copy-paste from torchvision references.
"""
import os
import subprocess
import time
from collections import defaultdict, deque
import datetime
import pickle
from typing import Optional, List

import torch
import torch.distributed as dist
from torch import Tensor
import logging

# needed due to empty tensor bug in pytorch and torchvision 0.5
import torchvision


def mkdir_if_not_exist(dirname):
    """Create directory `dirname` (including missing parents) unless the path already exists."""
    if not os.path.exists(dirname):
        # exist_ok=True closes the race where another process (e.g. a second
        # distributed worker) creates the directory between the check above
        # and this call, which used to raise FileExistsError.
        os.makedirs(dirname, exist_ok=True)


class SmoothedValue(object):
    """Keep the most recent values in a fixed-size window together with a
    running total/count, and expose statistics over either of them.
    """

    def __init__(self, window_size=20, fmt=None):
        self.deque = deque(maxlen=window_size)
        self.total = 0.0
        self.count = 0
        self.fmt = "{median:.4f} ({global_avg:.4f})" if fmt is None else fmt

    def update(self, value, n=1):
        """Record `value`, counting it `n` times in the global statistics."""
        self.deque.append(value)
        self.count += n
        self.total += value * n

    def synchronize_between_processes(self):
        """
        Sum `count` and `total` over all processes.
        Warning: does not synchronize the deque!
        """
        if not is_dist_avail_and_initialized():
            return
        stats = torch.tensor([self.count, self.total],
                             dtype=torch.float64, device='cuda')
        dist.barrier()
        dist.all_reduce(stats)
        reduced_count, reduced_total = stats.tolist()
        self.count = int(reduced_count)
        self.total = reduced_total

    @property
    def median(self):
        """Median of the values currently in the window."""
        window = torch.tensor(list(self.deque))
        return window.median().item()

    @property
    def avg(self):
        """Mean of the values currently in the window (computed in float32)."""
        window = torch.tensor(list(self.deque), dtype=torch.float32)
        return window.mean().item()

    @property
    def global_avg(self):
        """Average over everything recorded so far (total / count)."""
        return self.total / self.count

    @property
    def max(self):
        """Largest value currently in the window."""
        return max(self.deque)

    @property
    def value(self):
        """Most recently recorded value."""
        return self.deque[-1]

    def __str__(self):
        stats = dict(
            median=self.median,
            avg=self.avg,
            global_avg=self.global_avg,
            max=self.max,
            value=self.value,
        )
        return self.fmt.format(**stats)


def all_gather(data):
    """
    Run all_gather on arbitrary picklable data (not necessarily tensors)

    The object is pickled into a CUDA byte tensor, every rank's tensor is
    padded to the largest size, gathered, then trimmed and unpickled.
    Every rank has to call this function, since it issues collective
    operations (two `dist.all_gather` calls).

    Args:
        data: any picklable object
    Returns:
        list[data]: list of data gathered from each rank
    """
    world_size = get_world_size()
    if world_size == 1:
        # single process: nothing to communicate
        return [data]

    # serialized to a Tensor
    buffer = pickle.dumps(data)
    storage = torch.ByteStorage.from_buffer(buffer)
    tensor = torch.ByteTensor(storage).to("cuda")

    # obtain Tensor size of each rank
    local_size = torch.tensor([tensor.numel()], device="cuda")
    size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)]
    dist.all_gather(size_list, local_size)
    size_list = [int(size.item()) for size in size_list]
    max_size = max(size_list)

    # receiving Tensor from all ranks
    # we pad the tensor because torch all_gather does not support
    # gathering tensors of different shapes
    tensor_list = []
    for _ in size_list:
        tensor_list.append(torch.empty(
            (max_size,), dtype=torch.uint8, device="cuda"))
    # NOTE(review): local_size is a 1-element tensor compared against (and
    # subtracted from) a Python int here; this relies on implicit conversion.
    if local_size != max_size:
        # padding content is uninitialized; it is cut off again after the gather
        padding = torch.empty(size=(max_size - local_size,),
                              dtype=torch.uint8, device="cuda")
        tensor = torch.cat((tensor, padding), dim=0)
    dist.all_gather(tensor_list, tensor)

    data_list = []
    for size, tensor in zip(size_list, tensor_list):
        # keep only the first `size` bytes, i.e. drop the padding
        buffer = tensor.cpu().numpy().tobytes()[:size]
        # NOTE: pickle.loads is only safe because the bytes come from peer ranks
        data_list.append(pickle.loads(buffer))

    return data_list


def reduce_dict(input_dict, average=True):
    """
    Reduce the values of `input_dict` over all processes.

    Args:
        input_dict (dict): all the values will be reduced
        average (bool): whether to do average or sum
    Returns:
        a dict with the same fields as input_dict holding the reduced values.
        With fewer than two processes, input_dict itself is returned.
    """
    world_size = get_world_size()
    if world_size < 2:
        return input_dict
    with torch.no_grad():
        # sort the keys so that they are consistent across processes
        names = sorted(input_dict.keys())
        values = torch.stack([input_dict[name] for name in names], dim=0)
        dist.all_reduce(values)
        if average:
            values /= world_size
        reduced_dict = dict(zip(names, values))
    return reduced_dict


class MetricLogger(object):
    """Collection of named SmoothedValue meters with periodic progress logging."""

    def __init__(self, delimiter="\t"):
        self.meters = defaultdict(SmoothedValue)
        self.delimiter = delimiter

    def update(self, **kwargs):
        """Feed one scalar per keyword into the meter of the same name."""
        for name, value in kwargs.items():
            if isinstance(value, torch.Tensor):
                value = value.item()
            assert isinstance(value, (float, int))
            self.meters[name].update(value)

    def __getattr__(self, attr):
        # Only reached when normal attribute lookup fails: expose meters as attributes.
        if attr in self.meters:
            return self.meters[attr]
        if attr in self.__dict__:
            return self.__dict__[attr]
        raise AttributeError("'{}' object has no attribute '{}'".format(
            type(self).__name__, attr))

    def __str__(self):
        parts = ["{}: {}".format(name, str(meter))
                 for name, meter in self.meters.items()]
        return self.delimiter.join(parts)

    def synchronize_between_processes(self):
        """Synchronize every meter across processes."""
        for meter in self.meters.values():
            meter.synchronize_between_processes()

    def add_meter(self, name, meter):
        """Register `meter` under `name`, replacing any existing meter of that name."""
        self.meters[name] = meter

    def log_every(self, iterable, print_freq, header=None):
        """Yield the items of `iterable`, logging progress every `print_freq` items.

        A line is also logged for the last item, and a total-time summary is
        logged once the iterable is exhausted. `iterable` must support len().
        """
        if not header:
            header = ''
        i = 0
        start_time = time.time()
        end = time.time()
        iter_time = SmoothedValue(fmt='{avg:.4f}')
        data_time = SmoothedValue(fmt='{avg:.4f}')
        # pad the running index to the width of the total count
        space_fmt = ':' + str(len(str(len(iterable)))) + 'd'
        fields = [
            header,
            '[{0' + space_fmt + '}/{1}]',
            'eta: {eta}',
            '{meters}',
            'time: {time}',
            'data: {data}',
        ]
        if torch.cuda.is_available():
            fields.append('max mem: {memory:.0f}')
        log_msg = self.delimiter.join(fields)
        MB = 1024.0 * 1024.0
        for obj in iterable:
            # time spent waiting for the item vs. the whole iteration
            data_time.update(time.time() - end)
            yield obj
            iter_time.update(time.time() - end)
            if i % print_freq == 0 or i == len(iterable) - 1:
                eta_seconds = iter_time.global_avg * (len(iterable) - i)
                eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))
                fmt_kwargs = dict(
                    eta=eta_string,
                    meters=str(self),
                    time=str(iter_time),
                    data=str(data_time),
                )
                if torch.cuda.is_available():
                    fmt_kwargs['memory'] = torch.cuda.max_memory_allocated() / MB
                logging.info(log_msg.format(i, len(iterable), **fmt_kwargs))
            i += 1
            end = time.time()
        total_time = time.time() - start_time
        total_time_str = str(datetime.timedelta(seconds=int(total_time)))
        logging.info('{} Total time: {} ({:.4f} s / it)'.format(
            header, total_time_str, total_time / len(iterable)))


def get_sha():
    """Describe the git state of the directory containing this file.

    Returns a string ``"sha: ..., status: ..., branch: ..."``. Any failure of
    a git command is ignored; fields not obtained before the failure keep
    their defaults ('N/A' / "clean").
    """
    repo_dir = os.path.dirname(os.path.abspath(__file__))

    def _run(command):
        raw = subprocess.check_output(command, cwd=repo_dir)
        return raw.decode('ascii').strip()

    sha, diff, branch = 'N/A', "clean", 'N/A'
    try:
        sha = _run(['git', 'rev-parse', 'HEAD'])
        # output is discarded; the call only has to succeed
        subprocess.check_output(['git', 'diff'], cwd=repo_dir)
        changed = _run(['git', 'diff-index', 'HEAD'])
        diff = "has uncommited changes" if changed else "clean"
        branch = _run(['git', 'rev-parse', '--abbrev-ref', 'HEAD'])
    except Exception:
        # best effort: report whatever was collected so far
        pass
    return f"sha: {sha}, status: {diff}, branch: {branch}"


def collate_fn(batch):
    """Transpose a list of samples into per-field tuples and pad the first
    field into a NestedTensor via `nested_tensor_from_tensor_list`.
    """
    fields = list(zip(*batch))
    fields[0] = nested_tensor_from_tensor_list(fields[0])
    return tuple(fields)


def _max_by_axis(the_list):
    # type: (List[List[int]]) -> List[int]
    """Return the element-wise maximum over the sub-lists of `the_list`.

    The result is a new list; the input is left untouched. `the_list` must be
    non-empty, and no sub-list may be longer than the first one.
    """
    # Copy the first sub-list: the original aliased it and therefore silently
    # overwrote the caller's data with the running maxima.
    maxes = list(the_list[0])
    for sublist in the_list[1:]:
        for index, item in enumerate(sublist):
            maxes[index] = max(maxes[index], item)
    return maxes


class NestedTensor(object):
    """Pair of a batched tensor and an optional mask tensor."""

    def __init__(self, tensors, mask: Optional[Tensor]):
        self.tensors = tensors
        self.mask = mask

    def to(self, device):
        # type: (Device) -> NestedTensor # noqa
        """Return a new NestedTensor with tensors (and mask, if any) moved to `device`."""
        moved_tensors = self.tensors.to(device)
        moved_mask = self.mask.to(device) if self.mask is not None else None
        return NestedTensor(moved_tensors, moved_mask)

    def decompose(self):
        """Return the ``(tensors, mask)`` pair."""
        return self.tensors, self.mask

    def __repr__(self):
        return str(self.tensors)


def nested_tensor_from_tensor_list(tensor_list: List[Tensor]):
    """Zero-pad a list of tensors to a common size and batch them.

    Returns a NestedTensor whose mask is True at padded positions and False
    where real data was copied. 3-d items are padded along every axis; 2-d
    ([c,t]) and 4-d ([c,t,h,w]) items are padded along axis 1 only. Any other
    rank raises ValueError.
    """
    # TODO make this more general
    first = tensor_list[0]
    num_items = len(tensor_list)
    dtype, device = first.dtype, first.device

    if first.ndim == 3:
        if torchvision._is_tracing():
            # nested_tensor_from_tensor_list() does not export well to ONNX
            # call _onnx_nested_tensor_from_tensor_list() instead
            return _onnx_nested_tensor_from_tensor_list(tensor_list)

        # TODO make it support different-sized images
        max_size = _max_by_axis([list(img.shape) for img in tensor_list])
        batch_shape = [num_items] + max_size
        b, c, h, w = batch_shape
        tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
        mask = torch.ones((b, h, w), dtype=torch.bool, device=device)
        for img, pad_img, m in zip(tensor_list, tensor, mask):
            d0, d1, d2 = img.shape[0], img.shape[1], img.shape[2]
            pad_img[:d0, :d1, :d2].copy_(img)
            m[:d1, :d2] = False
    elif first.ndim in (2, 4):
        # longest extent along axis 1 over the batch; items are [c,t,h,w] or [c,t]
        max_len = max([video_ft.shape[1] for video_ft in tensor_list])
        if first.ndim == 2:
            batch_shape = [num_items, first.shape[0], max_len]
        else:
            batch_shape = [num_items, first.shape[0],
                           max_len, first.shape[2], first.shape[3]]
        b, c, t = batch_shape[:3]
        tensor = torch.zeros(batch_shape, dtype=dtype, device=device)
        mask = torch.ones((b, t), dtype=torch.bool, device=device)
        for video_ft, pad_video_ft, m in zip(tensor_list, tensor, mask):
            channels, length = video_ft.shape[0], video_ft.shape[1]
            pad_video_ft[:channels, :length].copy_(video_ft)
            m[:length] = False
    else:
        raise ValueError('not supported')
    return NestedTensor(tensor, mask)


def make_nested_tensor(tensor):
    """Wrap an already-batched tensor in a NestedTensor with an all-False mask.

    The mask has shape (tensor.shape[0], tensor.shape[2]) and lives on the
    same device as ``tensor``; every entry is False.
    """
    batch_size = tensor.size(0)
    num_steps = tensor.size(2)
    all_valid = torch.zeros(
        (batch_size, num_steps), dtype=torch.bool, device=tensor.device)
    return NestedTensor(tensor, all_valid)


# _onnx_nested_tensor_from_tensor_list() is an implementation of
# nested_tensor_from_tensor_list() that is supported by ONNX tracing.
@torch.jit.unused
def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor:
    """Pad every tensor in ``tensor_list`` to a common size and stack them.

    Functional (no in-place slice assignment) variant of the padding logic,
    built only from ``torch.nn.functional.pad`` and ``torch.stack``.

    Args:
        tensor_list: tensors to batch. The padding below indexes
            ``padding[0..2]``, so each tensor needs three dimensions.

    Returns:
        A NestedTensor holding the stacked, zero-padded tensors and a boolean
        mask that is True on padded positions and False on original data.
    """
    # Per-dimension maximum size over all tensors in the list.
    # NOTE(review): torch.stack needs tensors, so `img.shape[i]` is presumably
    # a tensor here (as under ONNX tracing) -- confirm before calling eagerly.
    max_size = []
    for i in range(tensor_list[0].dim()):
        max_size_i = torch.max(torch.stack(
            [img.shape[i] for img in tensor_list]).to(torch.float32)).to(torch.int64)
        max_size.append(max_size_i)
    max_size = tuple(max_size)

    # work around for
    # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img)
    # m[: img.shape[1], :img.shape[2]] = False
    # which is not yet supported in onnx
    padded_imgs = []
    padded_masks = []
    for img in tensor_list:
        # Amount of trailing padding needed along each dimension.
        padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))]
        # F.pad takes (left, right) pairs starting from the LAST dimension,
        # hence the reversed order; only the trailing side is padded.
        padded_img = torch.nn.functional.pad(
            img, (0, padding[2], 0, padding[1], 0, padding[0]))
        padded_imgs.append(padded_img)

        # Mask starts as zeros over the last two dims of the original tensor
        # and is padded with ones, so padded positions become True.
        m = torch.zeros_like(img[0], dtype=torch.int, device=img.device)
        padded_mask = torch.nn.functional.pad(
            m, (0, padding[2], 0, padding[1]), "constant", 1)
        padded_masks.append(padded_mask.to(torch.bool))

    tensor = torch.stack(padded_imgs)
    mask = torch.stack(padded_masks)

    return NestedTensor(tensor, mask=mask)


def setup_for_distributed(is_master):
    """
    Replace the builtin ``print`` so that only the master process prints.

    After this call, ``print(...)`` is a no-op unless ``is_master`` is truthy
    or the caller passes ``force=True``. The ``force`` keyword is consumed
    here and never forwarded to the real ``print``.
    """
    import builtins

    original_print = builtins.print

    def gated_print(*args, force=False, **kwargs):
        if force or is_master:
            original_print(*args, **kwargs)

    builtins.print = gated_print


def is_dist_avail_and_initialized():
    """Return True only if torch.distributed is available and initialized."""
    # Short-circuit: is_initialized() is only consulted when the distributed
    # package is available.
    return dist.is_available() and dist.is_initialized()


def get_world_size():
    """Return the distributed world size, or 1 when not running distributed."""
    return dist.get_world_size() if is_dist_avail_and_initialized() else 1


def get_rank():
    """Return this process's distributed rank, or 0 when not running distributed."""
    return dist.get_rank() if is_dist_avail_and_initialized() else 0


def is_main_process():
    """Return True when the current process has rank 0."""
    rank = get_rank()
    return rank == 0


def save_on_master(*args, **kwargs):
    """Forward all arguments to ``torch.save``, but only on the main process."""
    if not is_main_process():
        return
    torch.save(*args, **kwargs)


def init_distributed_mode(args):
    """Configure ``args`` for distributed training and start the process group.

    Rank/GPU are read from the environment:
      * ``RANK`` + ``WORLD_SIZE`` (+ ``LOCAL_RANK``) when both are set;
      * otherwise ``SLURM_PROCID``, with the GPU chosen as rank modulo the
        number of visible CUDA devices (``args.world_size`` is not set in
        this branch and must already be present on ``args``);
      * otherwise distributed mode is disabled and the function returns early
        with ``args.distributed = False``.

    ``args.dist_url`` must already be set when a distributed launch is detected.
    """
    env = os.environ
    has_launcher_vars = 'RANK' in env and 'WORLD_SIZE' in env

    if not has_launcher_vars and 'SLURM_PROCID' not in env:
        print('Not using distributed mode')
        args.distributed = False
        return

    if has_launcher_vars:
        args.rank = int(env["RANK"])
        args.world_size = int(env['WORLD_SIZE'])
        args.gpu = int(env['LOCAL_RANK'])
    else:
        args.rank = int(env['SLURM_PROCID'])
        args.gpu = args.rank % torch.cuda.device_count()

    args.distributed = True

    # Bind this process to its GPU before creating the NCCL process group.
    torch.cuda.set_device(args.gpu)
    args.dist_backend = 'nccl'
    print('| distributed init (rank {}): {}'.format(
        args.rank, args.dist_url), flush=True)
    torch.distributed.init_process_group(
        backend=args.dist_backend,
        init_method=args.dist_url,
        world_size=args.world_size,
        rank=args.rank,
    )
    torch.distributed.barrier()
    # From here on, only rank 0 prints (unless force=True is passed).
    setup_for_distributed(args.rank == 0)


@torch.no_grad()
def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k.

    Args:
        output: score tensor of shape (N, num_classes).
        target: tensor of N ground-truth class indices.
        topk: iterable of k values to evaluate.

    Returns:
        A list with one scalar tensor per entry of ``topk``, each holding the
        percentage (0-100) of samples whose target is among the k
        highest-scoring classes. When ``target`` is empty, every entry is 0.
    """
    if target.numel() == 0:
        # One zero per requested k, so callers can unpack the result the same
        # way as in the non-empty case.
        return [torch.zeros([], device=output.device) for _ in topk]
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        # `correct` inherits the transposed (non-contiguous) layout of `pred`,
        # so `.view(-1)` can raise for k > 1; `.reshape(-1)` copies if needed.
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res


def inverse_sigmoid(x, eps=1e-5):
    """Numerically safe logit: log(x / (1 - x)).

    ``x`` is first clamped to [0, 1]; numerator and denominator are then each
    floored at ``eps`` so the result stays finite at 0 and 1.
    """
    prob = torch.clamp(x, min=0, max=1)
    numerator = torch.clamp(prob, min=eps)
    denominator = torch.clamp(1 - prob, min=eps)
    return torch.log(numerator / denominator)


================================================
FILE: util/segment_ops.py
================================================
# ------------------------------------------------------------------------
# TadTR: End-to-end Temporal Action Detection with Transformer
# Copyright (c) 2021. Xiaolong Liu.
# ------------------------------------------------------------------------
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# ------------------------------------------------------------------------

"""
Utilities for segment manipulation and IoU.
"""
import torch
import numpy as np
# from torchvision.ops.boxes import box_area


def segment_cw_to_t1t2(x):
    '''Convert segments from (center, width) to (t_start, t_end).

    Counterpart of box_cxcywh_to_xyxy in detr.
    Params:
        x: segments in (center, width) format, shape=(*, 2);
           either a numpy array or a torch tensor
    Returns:
        segments in (t_start, t_end) format, shape=(*, 2), same array type
    '''
    if isinstance(x, np.ndarray):
        center, width = x[..., 0], x[..., 1]
        half = 0.5 * width
        return np.stack([center - half, center + half], axis=-1)

    center, width = x.unbind(-1)
    half = 0.5 * width
    return torch.stack([center - half, center + half], dim=-1)


def segment_t1t2_to_cw(x):
    '''Convert segments from (t_start, t_end) to (center, width).

    Counterpart of box_xyxy_to_cxcywh in detr.
    Params:
        x: segments in (t_start, t_end) format, shape=(*, 2);
           either a numpy array or a torch tensor
    Returns:
        segments in (center, width) format, shape=(*, 2), same array type
    '''
    if isinstance(x, np.ndarray):
        start, end = x[..., 0], x[..., 1]
        return np.stack([(start + end) / 2, end - start], axis=-1)

    start, end = x.unbind(-1)
    center = (start + end) / 2
    width = end - start
    return torch.stack([center, width], dim=-1)


def segment_length(segments):
    """Length (end - start) of each (t_start, t_end) row, floored at 0."""
    starts, ends = segments[:, 0], segments[:, 1]
    return torch.clamp(ends - starts, min=0)


# modified from torchvision to also return the union
def segment_iou_and_union(segments1, segments2):
    """Pairwise temporal IoU and union length between two sets of segments.

    Both inputs hold (t_start, t_end) rows. Returns ``(iou, union)``, each of
    shape [N, M] with N = len(segments1) and M = len(segments2).
    """
    length1 = segment_length(segments1)
    length2 = segment_length(segments2)

    # Broadcast segments1 along a new axis so every pair is compared: [N, M].
    latest_start = torch.max(segments1[:, 0].unsqueeze(1), segments2[:, 0])
    earliest_end = torch.min(segments1[:, 1].unsqueeze(1), segments2[:, 1])
    overlap = torch.clamp(earliest_end - latest_start, min=0)  # [N, M]

    union = length1.unsqueeze(1) + length2 - overlap

    return overlap / union, union


def segment_iou(segments1, segments2):
    """
    Pairwise temporal IoU between two sets of segments.

    The segments should be in (t_start, t_end) format.

    Returns a [N, M] pairwise matrix, where N = len(segments1)
    and M = len(segments2)
    """
    # degenerate segments give inf / nan results
    # so do an early check
    assert (segments1[:, 1] >= segments1[:, 0]).all()

    # Same computation as segment_iou_and_union; reuse it and drop the union.
    iou, _ = segment_iou_and_union(segments1, segments2)
    return iou


def temporal_iou_numpy(proposal_min, proposal_max, gt_min, gt_max):
    """Compute IoU score between a groundtruth instance and the proposals.

    Args:
        proposal_min (list[float]): List of temporal anchor min.
        proposal_max (list[float]): List of temporal anchor max.
        gt_min (float): Groundtruth temporal box min.
        gt_max (float): Groundtruth temporal box max.

    Returns:
        list[float]: List of iou scores.
    """
    overlap_start = np.maximum(proposal_min, gt_min)
    overlap_end = np.minimum(proposal_max, gt_max)
    # Disjoint intervals give a negative difference; floor it at zero.
    overlap = np.maximum(overlap_end - overlap_start, 0.)

    proposal_len = proposal_max - proposal_min
    union = proposal_len - overlap + gt_max - gt_min
    return np.divide(overlap, union)


def temporal_iop_numpy(proposal_min, proposal_max, gt_min, gt_max):
    """Compute IoP score between a groundtruth bbox and the proposals.

    Compute the IoP which is defined as the overlap ratio with
    groundtruth proportional to the duration of this proposal.

    This function used to be declared as ``temporal_iou_numpy``, which
    silently replaced the IoU implementation defined just above it; it now
    has its own name so that ``temporal_iou_numpy`` really returns IoU.

    Args:
        proposal_min (list[float]): List of temporal anchor min.
        proposal_max (list[float]): List of temporal anchor max.
        gt_min (float): Groundtruth temporal box min.
        gt_max (float): Groundtruth temporal box max.

    Returns:
        list[float]: List of intersection over anchor scores.
    """
    len_anchors = np.array(proposal_max - proposal_min)
    int_tmin = np.maximum(proposal_min, gt_min)
    int_tmax = np.minimum(proposal_max, gt_max)
    # Disjoint intervals give a negative difference; floor it at zero.
    inter_len = np.maximum(int_tmax - int_tmin, 0.)
    scores = np.divide(inter_len, len_anchors)
    return scores


def soft_nms(proposals, alpha, low_threshold, high_threshold, top_k):
    """Soft NMS for temporal proposals.

    Repeatedly picks the highest-scoring remaining proposal and decays the
    scores of the other remaining proposals whose overlap with it (as
    returned by ``temporal_iou_numpy``) exceeds a width-dependent threshold.

    Args:
        proposals (np.ndarray): Proposals generated by network; columns
            0, 1 and 2 are read as start, end and score.
        alpha (float): Alpha value of Gaussian decaying function.
        low_threshold (float): Low threshold for soft nms.
        high_threshold (float): High threshold for soft nms.
        top_k (int): Top k values to be considered.

    Returns:
        np.ndarray: The updated proposals, one (start, end, score) row each.
    """
    ranked = proposals[proposals[:, -1].argsort()[::-1]]
    pending_start = list(ranked[:, 0])
    pending_end = list(ranked[:, 1])
    pending_score = list(ranked[:, 2])
    kept_start, kept_end, kept_score = [], [], []

    # NOTE(review): the `<=` lets the loop emit up to top_k + 1 proposals.
    while len(pending_score) > 0 and len(kept_score) <= top_k:
        best = np.argmax(pending_score)
        best_width = pending_end[best] - pending_start[best]
        overlaps = temporal_iou_numpy(
            pending_start[best], pending_end[best],
            np.array(pending_start), np.array(pending_end))
        decay = np.exp(-np.square(overlaps) / alpha)

        # The threshold grows linearly with the width of the selected proposal.
        decay_threshold = low_threshold + (high_threshold -
                                           low_threshold) * best_width
        for idx, overlap in enumerate(overlaps):
            if idx == best:
                continue
            if overlap > decay_threshold:
                pending_score[idx] = pending_score[idx] * decay[idx]

        # Move the selected proposal from the pending lists to the result.
        kept_start.append(pending_start.pop(best))
        kept_end.append(pending_end.pop(best))
        kept_score.append(pending_score.pop(best))

    return np.column_stack((kept_start, kept_end, kept_score))


def temporal_nms(segments, thresh):
    """
    One-dimensional (hard) non-maximal suppression.

    :param segments: np.ndarray of rows [st, ed, score, ...]
    :param thresh: a segment is dropped when its IoU with an already kept,
        higher-scoring segment is greater than this value
    :return: the kept rows of ``segments``, in descending score order
    """
    starts, ends, scores = segments[:, 0], segments[:, 1], segments[:, 2]
    lengths = ends - starts

    remaining = scores.argsort()[::-1]
    selected = []
    while remaining.size > 0:
        top, rest = remaining[0], remaining[1:]
        selected.append(top)

        # Signed overlap: negative for disjoint segments, which therefore
        # always fall below any non-negative threshold and are kept.
        overlap = (np.minimum(ends[top], ends[rest])
                   - np.maximum(starts[top], starts[rest]))
        union = (lengths[top] + lengths[rest] - overlap).astype(float)
        tiou = overlap / union

        remaining = rest[tiou <= thresh]

    return segments[selected, :]