Repository: Zhongdao/UniTrack
Branch: main
Commit: a83e782f5c56
Files: 379
Total size: 2.3 MB
Directory structure:
gitextract_942_umbq/
├── .gitignore
├── LICENSE
├── README.md
├── config/
│ ├── crw_resnet18_s3.yaml
│ ├── crw_resnet18_s3_womotion.yaml
│ ├── imagenet_resnet18_s3.yaml
│ └── imagenet_resnet18_s3_womotion.yaml
├── core/
│ ├── association/
│ │ ├── __init__.py
│ │ └── matching.py
│ ├── motion/
│ │ └── kalman_filter.py
│ └── propagation/
│ ├── __init__.py
│ ├── propagate_box.py
│ ├── propagate_mask.py
│ └── propagate_pose.py
├── data/
│ ├── jhmdb.py
│ ├── kinetics.py
│ ├── video.py
│ └── vos.py
├── demo/
│ ├── mot_demo.py
│ └── sot_demo.py
├── detector/
│ └── YOLOX/
│ ├── .gitignore
│ ├── LICENSE
│ ├── README.md
│ ├── datasets/
│ │ └── README.md
│ ├── demo/
│ │ ├── ONNXRuntime/
│ │ │ ├── README.md
│ │ │ └── onnx_inference.py
│ │ ├── OpenVINO/
│ │ │ ├── README.md
│ │ │ ├── cpp/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ ├── README.md
│ │ │ │ └── yolox_openvino.cpp
│ │ │ └── python/
│ │ │ ├── README.md
│ │ │ └── openvino_inference.py
│ │ ├── TensorRT/
│ │ │ ├── cpp/
│ │ │ │ ├── CMakeLists.txt
│ │ │ │ ├── README.md
│ │ │ │ ├── logging.h
│ │ │ │ └── yolox.cpp
│ │ │ └── python/
│ │ │ └── README.md
│ │ └── ncnn/
│ │ ├── android/
│ │ │ ├── README.md
│ │ │ ├── app/
│ │ │ │ ├── build.gradle
│ │ │ │ └── src/
│ │ │ │ └── main/
│ │ │ │ ├── AndroidManifest.xml
│ │ │ │ ├── assets/
│ │ │ │ │ └── yolox.param
│ │ │ │ ├── java/
│ │ │ │ │ └── com/
│ │ │ │ │ └── megvii/
│ │ │ │ │ └── yoloXncnn/
│ │ │ │ │ ├── MainActivity.java
│ │ │ │ │ ├── YOLOXncnn.java
│ │ │ │ │ └── yoloXncnn.java
│ │ │ │ ├── jni/
│ │ │ │ │ ├── CMakeLists.txt
│ │ │ │ │ └── yoloXncnn_jni.cpp
│ │ │ │ └── res/
│ │ │ │ ├── layout/
│ │ │ │ │ └── main.xml
│ │ │ │ └── values/
│ │ │ │ └── strings.xml
│ │ │ ├── build.gradle
│ │ │ ├── gradle/
│ │ │ │ └── wrapper/
│ │ │ │ ├── gradle-wrapper.jar
│ │ │ │ └── gradle-wrapper.properties
│ │ │ ├── gradlew
│ │ │ ├── gradlew.bat
│ │ │ └── settings.gradle
│ │ └── cpp/
│ │ ├── README.md
│ │ └── yolox.cpp
│ ├── demo.py
│ ├── docs/
│ │ └── train_custom_data.md
│ ├── exps/
│ │ ├── default/
│ │ │ ├── nano.py
│ │ │ ├── yolov3.py
│ │ │ ├── yolox_l.py
│ │ │ ├── yolox_m.py
│ │ │ ├── yolox_s.py
│ │ │ ├── yolox_tiny.py
│ │ │ └── yolox_x.py
│ │ └── example/
│ │ └── yolox_voc/
│ │ └── yolox_voc_s.py
│ ├── requirements.txt
│ ├── setup.cfg
│ ├── setup.py
│ ├── tools/
│ │ ├── __init__.py
│ │ ├── demo.py
│ │ ├── eval.py
│ │ ├── export_onnx.py
│ │ ├── train.py
│ │ └── trt.py
│ └── yolox/
│ ├── __init__.py
│ ├── core/
│ │ ├── __init__.py
│ │ ├── launch.py
│ │ └── trainer.py
│ ├── data/
│ │ ├── __init__.py
│ │ ├── data_augment.py
│ │ ├── data_prefetcher.py
│ │ ├── dataloading.py
│ │ ├── datasets/
│ │ │ ├── __init__.py
│ │ │ ├── coco.py
│ │ │ ├── coco_classes.py
│ │ │ ├── datasets_wrapper.py
│ │ │ ├── mosaicdetection.py
│ │ │ ├── voc.py
│ │ │ └── voc_classes.py
│ │ └── samplers.py
│ ├── evaluators/
│ │ ├── __init__.py
│ │ ├── coco_evaluator.py
│ │ ├── voc_eval.py
│ │ └── voc_evaluator.py
│ ├── exp/
│ │ ├── __init__.py
│ │ ├── base_exp.py
│ │ ├── build.py
│ │ └── yolox_base.py
│ ├── layers/
│ │ ├── __init__.py
│ │ ├── csrc/
│ │ │ ├── cocoeval/
│ │ │ │ ├── cocoeval.cpp
│ │ │ │ └── cocoeval.h
│ │ │ └── vision.cpp
│ │ └── fast_coco_eval_api.py
│ ├── models/
│ │ ├── __init__.py
│ │ ├── darknet.py
│ │ ├── losses.py
│ │ ├── network_blocks.py
│ │ ├── yolo_fpn.py
│ │ ├── yolo_head.py
│ │ ├── yolo_pafpn.py
│ │ └── yolox.py
│ └── utils/
│ ├── __init__.py
│ ├── allreduce_norm.py
│ ├── boxes.py
│ ├── checkpoint.py
│ ├── demo_utils.py
│ ├── dist.py
│ ├── ema.py
│ ├── logger.py
│ ├── lr_scheduler.py
│ ├── metric.py
│ ├── model_utils.py
│ ├── setup_env.py
│ └── visualize.py
├── docs/
│ ├── DATA.md
│ ├── INSTALL.md
│ ├── MODELZOO.md
│ ├── RESULTS.md
│ └── RUN.md
├── eval/
│ ├── convert_davis.py
│ ├── davis_dummy.txt
│ ├── eval_mot.py
│ ├── eval_pck.py
│ ├── mots/
│ │ ├── Evaluator.py
│ │ ├── LICENSE
│ │ ├── MOTSVisualization.py
│ │ ├── MOTS_metrics.py
│ │ ├── Metrics.py
│ │ ├── README.md
│ │ ├── Visualize.py
│ │ ├── __init__.py
│ │ ├── evalMOTS.py
│ │ ├── mots_common/
│ │ │ ├── images_to_txt.py
│ │ │ └── io.py
│ │ └── requirements.txt
│ ├── palette.py
│ ├── poseval/
│ │ ├── .gitignore
│ │ ├── .gitmodules
│ │ ├── .pylintrc
│ │ ├── README.md
│ │ ├── evaluate.py
│ │ ├── license.txt
│ │ ├── matlab/
│ │ │ ├── external/
│ │ │ │ └── jsonlab/
│ │ │ │ ├── AUTHORS.txt
│ │ │ │ ├── ChangeLog.txt
│ │ │ │ ├── LICENSE_BSD.txt
│ │ │ │ ├── README.txt
│ │ │ │ ├── examples/
│ │ │ │ │ ├── demo_jsonlab_basic.m
│ │ │ │ │ ├── demo_ubjson_basic.m
│ │ │ │ │ ├── example1.json
│ │ │ │ │ ├── example2.json
│ │ │ │ │ ├── example3.json
│ │ │ │ │ ├── example4.json
│ │ │ │ │ ├── jsonlab_basictest.matlab
│ │ │ │ │ ├── jsonlab_selftest.m
│ │ │ │ │ ├── jsonlab_selftest.matlab
│ │ │ │ │ └── jsonlab_speedtest.m
│ │ │ │ ├── jsonopt.m
│ │ │ │ ├── loadjson.m
│ │ │ │ ├── loadubjson.m
│ │ │ │ ├── mergestruct.m
│ │ │ │ ├── savejson.m
│ │ │ │ ├── saveubjson.m
│ │ │ │ ├── struct2jdata.m
│ │ │ │ └── varargin2struct.m
│ │ │ ├── mat2json.m
│ │ │ └── startup.m
│ │ └── poseval/
│ │ ├── __init__.py
│ │ ├── convert.py
│ │ ├── eval_helpers.py
│ │ ├── evaluateAP.py
│ │ ├── evaluatePCKh.py
│ │ ├── evaluateTracking.py
│ │ └── posetrack18_id2fname.py
│ └── trackeval/
│ ├── __init__.py
│ ├── _timing.py
│ ├── datasets/
│ │ ├── __init__.py
│ │ ├── _base_dataset.py
│ │ ├── bdd100k.py
│ │ ├── davis.py
│ │ ├── kitti_2d_box.py
│ │ ├── kitti_mots.py
│ │ ├── mot_challenge_2d_box.py
│ │ ├── mots_challenge.py
│ │ ├── tao.py
│ │ └── youtube_vis.py
│ ├── eval.py
│ ├── metrics/
│ │ ├── __init__.py
│ │ ├── _base_metric.py
│ │ ├── clear.py
│ │ ├── count.py
│ │ ├── hota.py
│ │ ├── identity.py
│ │ ├── j_and_f.py
│ │ ├── track_map.py
│ │ └── vace.py
│ ├── plotting.py
│ └── utils.py
├── eval.sh
├── model/
│ ├── __init__.py
│ ├── functional.py
│ ├── hrnet.py
│ ├── model.py
│ ├── random_feat_generator.py
│ └── resnet.py
├── requirements.txt
├── setup.py
├── test/
│ ├── test_mot.py
│ ├── test_mots.py
│ ├── test_poseprop.py
│ ├── test_posetrack.py
│ ├── test_sot_cfnet.py
│ ├── test_sot_siamfc.py
│ ├── test_vis.py
│ └── test_vos.py
├── tools/
│ ├── gen_mot16_fairmot.py
│ ├── gen_mot16_gt.py
│ ├── gen_mot16_label17.py
│ ├── gen_mot19_det.py
│ ├── gen_mots_costa.py
│ └── gen_mots_gt.py
├── tracker/
│ ├── mot/
│ │ ├── basetrack.py
│ │ ├── box.py
│ │ ├── mask.py
│ │ ├── multitracker.py
│ │ └── pose.py
│ └── sot/
│ └── lib/
│ ├── core/
│ │ ├── config.py
│ │ ├── config_ocean.py
│ │ ├── config_oceanplus.py
│ │ ├── config_siamdw.py
│ │ ├── eval_davis.py
│ │ ├── eval_got10k.py
│ │ ├── eval_lasot.py
│ │ ├── eval_otb.py
│ │ ├── eval_visdrone.py
│ │ ├── extract_tune_logs.py
│ │ └── function.py
│ ├── dataset/
│ │ ├── crop/
│ │ │ ├── DAVIS/
│ │ │ │ ├── gen_json.py
│ │ │ │ ├── par_crop.py
│ │ │ │ └── readme.md
│ │ │ ├── RGBT210/
│ │ │ │ ├── RGBT210_genjson.py
│ │ │ │ ├── gen_json.py
│ │ │ │ ├── par_crop.py
│ │ │ │ └── readme.md
│ │ │ ├── RGBT234/
│ │ │ │ ├── RGBT234_genjson.py
│ │ │ │ ├── gen_json.py
│ │ │ │ ├── par_crop.py
│ │ │ │ └── readme.md
│ │ │ ├── coco/
│ │ │ │ ├── gen_json.py
│ │ │ │ ├── par_crop.py
│ │ │ │ └── readme.md
│ │ │ ├── det/
│ │ │ │ ├── gen_json.py
│ │ │ │ ├── par_crop.py
│ │ │ │ └── readme.md
│ │ │ ├── got10k/
│ │ │ │ ├── gen_json.py
│ │ │ │ ├── par_crop.py
│ │ │ │ ├── parser_got10k.py
│ │ │ │ └── readme.md
│ │ │ ├── lasot/
│ │ │ │ ├── gen_json.py
│ │ │ │ ├── par_crop.py
│ │ │ │ ├── parser_lasot.py
│ │ │ │ └── readme.md
│ │ │ ├── vid/
│ │ │ │ ├── gen_json.py
│ │ │ │ ├── par_crop.py
│ │ │ │ ├── parse_vid.py
│ │ │ │ └── readme.md
│ │ │ └── visdrone/
│ │ │ ├── gen_json.py
│ │ │ ├── par_crop.py
│ │ │ ├── parser_visdrone.py
│ │ │ └── readme.md
│ │ ├── ocean.py
│ │ └── siamfc.py
│ ├── eval_toolkit/
│ │ ├── bin/
│ │ │ ├── _init_paths.py
│ │ │ └── eval.py
│ │ ├── pysot/
│ │ │ ├── __init__.py
│ │ │ ├── datasets/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── dataset.py
│ │ │ │ ├── got10k.py
│ │ │ │ ├── lasot.py
│ │ │ │ ├── nfs.py
│ │ │ │ ├── otb.py
│ │ │ │ ├── trackingnet.py
│ │ │ │ ├── uav.py
│ │ │ │ ├── video.py
│ │ │ │ └── vot.py
│ │ │ ├── evaluation/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── ar_benchmark.py
│ │ │ │ ├── eao_benchmark.py
│ │ │ │ ├── f1_benchmark.py
│ │ │ │ └── ope_benchmark.py
│ │ │ ├── utils/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── c_region.pxd
│ │ │ │ ├── misc.py
│ │ │ │ ├── region.c
│ │ │ │ ├── region.pyx
│ │ │ │ ├── setup.py
│ │ │ │ ├── src/
│ │ │ │ │ ├── buffer.h
│ │ │ │ │ ├── region.c
│ │ │ │ │ └── region.h
│ │ │ │ └── statistics.py
│ │ │ └── visualization/
│ │ │ ├── __init__.py
│ │ │ ├── draw_eao.py
│ │ │ ├── draw_f1.py
│ │ │ ├── draw_success_precision.py
│ │ │ └── draw_utils.py
│ │ └── requirements.txt
│ ├── models/
│ │ ├── __init__.py
│ │ ├── backbones.py
│ │ ├── cfnet.py
│ │ ├── connect.py
│ │ ├── modules.py
│ │ ├── online/
│ │ │ ├── __init__.py
│ │ │ ├── backbone/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── resnet.py
│ │ │ │ └── resnet18_vggm.py
│ │ │ ├── bbreg/
│ │ │ │ ├── __init__.py
│ │ │ │ └── iou_net.py
│ │ │ ├── classifier/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── features.py
│ │ │ │ ├── initializer.py
│ │ │ │ ├── linear_filter.py
│ │ │ │ └── optimizer.py
│ │ │ └── layers/
│ │ │ ├── __init__.py
│ │ │ ├── activation.py
│ │ │ ├── blocks.py
│ │ │ ├── distance.py
│ │ │ ├── filter.py
│ │ │ ├── normalization.py
│ │ │ └── transform.py
│ │ └── siamfc.py
│ ├── online/
│ │ ├── __init__.py
│ │ ├── augmentation.py
│ │ ├── base_actor.py
│ │ ├── base_trainer.py
│ │ ├── complex.py
│ │ ├── dcf.py
│ │ ├── extractor.py
│ │ ├── fourier.py
│ │ ├── loading.py
│ │ ├── ltr_trainer.py
│ │ ├── model_constructor.py
│ │ ├── operation.py
│ │ ├── optim.py
│ │ ├── optimization.py
│ │ ├── preprocessing.py
│ │ ├── tensordict.py
│ │ ├── tensorlist.py
│ │ └── tracking.py
│ ├── tracker/
│ │ ├── ocean.py
│ │ ├── oceanplus.py
│ │ ├── online.py
│ │ └── siamfc.py
│ ├── utils/
│ │ ├── __init__.py
│ │ ├── cutout.py
│ │ ├── extract_tpejson_fc.py
│ │ ├── extract_tpejson_ocean.py
│ │ ├── extract_tpelog.py
│ │ ├── extract_tpelog_fc.py
│ │ ├── utils.py
│ │ └── watch_tpe.sh
│ └── version.py
└── utils/
├── __init__.py
├── box.py
├── io.py
├── log.py
├── mask.py
├── meter.py
├── palette.py
└── visualize.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
docs/test_video.mp4
test/test_tao.py
config/tao*
tracker/mot/tao.py
eval/error_log.txt
config/got10k*
config/lasot*
config/tc128*
config/tlp*
config/trackingnet*
config/vfs*
config/ssib*
weights/
results/
out/
vis/
*.ipynb
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
================================================
FILE: LICENSE
================================================
MIT License
Copyright (c) 2021 ZhongdaoWang
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: README.md
================================================
--------------------------------------------------------------------------------
**[NeurIPS 2021] Do different tracking tasks require different appearance model?**
**[[ArXiv](https://arxiv.org/abs/2107.02156)]** **[[Project Page](https://zhongdao.github.io/UniTrack)]**
UniTrack is a simple and Unified framework for addressing multiple tracking tasks.
Being a fundamental problem in computer vision, tracking has been fragmented into a multitude of different experimental setups. As a consequence, the literature has fragmented too, and now the novel approaches proposed by the community are usually specialized to fit only one specific setup. To understand to what extent this specialization is actually necessary, we present UniTrack, a solution to address multiple different tracking tasks within the same framework. All tasks share the same [appearance model](#appearance-model). UniTrack
- Does **NOT** need training on a specific tracking task.
- Shows [competitive performance](docs/RESULTS.md) on six out of seven tracking tasks considered.
- Can be easily adapted to even [more tasks](#demo).
- Can be used as an evaluation platform to [test pre-trained self-supervised models](docs/MODELZOO.md).
## Demo
**Multi-Object Tracking demo for 80 COCO classes ([YOLOX](https://github.com/Megvii-BaseDetection/YOLOX) + UniTrack)**
In this demo we run the YOLOX detector and perform MOT for the 80 COCO classes. Try the demo by:
```python
python demo/mot_demo.py --classes cls1 cls2 ... clsN
```
where cls1 to clsN represent the indices of classes you would like to detect and track. See [here](https://gist.github.com/AruniRC/7b3dadd004da04c80198557db5da4bda) for the index list. By default all 80 classes are detected and tracked.
**Single-Object Tracking demo for custom videos**
```python
python demo/sot_demo.py --config ./config/imagenet_resnet18_s3.yaml --input /path/to/your/video
```
In this demo, you are asked to annotate the target to be tracked, by drawing a rectangle in the first frame of the video. Then the algorithm tracks the target in following timesteps without object detection.
## Tasks & Framework

### Tasks
We classify existing tracking tasks along four axes: (1) Single or multiple targets; (2) Users specify targets or automatic detectors specify targets; (3) Observation formats (bounding box/mask/pose); (4) Class-agnostic or class-specific (i.e. human/vehicles). We mainly experiment on 5 tasks: **SOT, VOS, MOT, MOTS, and PoseTrack**. Task setups are summarized in the above figure.
### Appearance model
An appearance model is the only learnable component in UniTrack. It should provide universal visual representation, and is usually pre-trained on large-scale dataset in supervised or unsupervised manners. Typical examples include ImageNet pre-trained ResNets (supervised), and recent self-supervised models such as MoCo and SimCLR (unsupervised).
### Propagation and Association
*Propagation* and *Association* are the two core primitives used in UniTrack to address a wide variety of tracking tasks (currently 7, but more can be added). Both use the features extracted by the pre-trained appearance model. For propagation, we adopt existing methods such as [cross correlation](https://www.robots.ox.ac.uk/~luca/siamese-fc.html), [DCF](https://openaccess.thecvf.com/content_cvpr_2017/html/Valmadre_End-To-End_Representation_Learning_CVPR_2017_paper.html), and [mask propagation](https://github.com/ajabri/videowalk). For association we employ a simple algorithm as in [JDE](https://github.com/Zhongdao/Towards-Realtime-MOT) and develop a novel reconstruction-based similarity metric that allows to compare objects across shapes and sizes.
## Getting started
1. Installation: Please check out [docs/INSTALL.md](docs/INSTALL.md)
2. Data preparation: Please check out [docs/DATA.md](docs/DATA.md)
3. Appearance model preparation: Please check out [docs/MODELZOO.md](docs/MODELZOO.md)
4. Run evaluation on all datasets: Please check out [docs/RUN.md](docs/RUN.md)
## Results
Below we show results of UniTrack with a simple **ImageNet Pre-trained ResNet-18** as the appearance model. More results can be found in [RESULTS.md](docs/RESULTS.md).
**Single Object Tracking (SOT) on OTB-2015**
**Video Object Segmentation (VOS) on DAVIS-2017 *val* split**
**Multiple Object Tracking (MOT) on MOT-16 [*test* set *private detector* track](https://motchallenge.net/method/MOT=3856&chl=5)** (Detections from FairMOT)
**Multiple Object Tracking and Segmentation (MOTS) on MOTS challenge [*test* set](https://motchallenge.net/method/MOTS=109&chl=17)** (Detections from COSTA_st)
**Pose Tracking on PoseTrack-2018 *val* split** (Detections from LightTrack)
## Acknowledgement
A part of code is borrowed from
[VideoWalk](https://github.com/ajabri/videowalk) by Allan A. Jabri
[SOT code](https://github.com/JudasDie/SOTS) by Zhipeng Zhang
## Citation
```bibtex
@article{wang2021different,
author = {Wang, Zhongdao and Zhao, Hengshuang and Li, Ya-Li and Wang, Shengjin and Torr, Philip and Bertinetto, Luca},
title = {Do different tracking tasks require different appearance models?},
journal = {Thirty-Fifth Conference on Neural Information Processing Systems},
year = {2021},
}
```
================================================
FILE: config/crw_resnet18_s3.yaml
================================================
common:
exp_name: crw_resnet18_s3
# Model related
model_type: crw
remove_layers: ['layer4']
im_mean: [0.4914, 0.4822, 0.4465]
im_std: [0.2023, 0.1994, 0.2010]
nopadding: False
head_depth: -1
resume: 'weights/crw.pth'
# Misc
down_factor: 8
infer2D: True
workers: 4
gpu_id: 0
device: cuda
sot:
dataset: 'OTB2015'
dataroot: '/home/wangzd/datasets/GOT/OTB100/'
epoch_test: False
vos:
davisroot: '/home/wangzd/datasets/uvc/DAVIS/'
split: 'val'
temperature: 0.05
topk: 10
radius: 12
videoLen: 5
cropSize: -1
head_depth: -1
no_l2: False
long_mem: [0]
infer2D: False
norm_mask: False
mot:
obid: 'FairMOT'
mot_root: '/home/wangzd/datasets/MOT/MOT16'
feat_size: [4,10]
save_videos: True
save_images: False
test_mot16: False
track_buffer: 30
min_box_area: 200
nms_thres: 0.4
conf_thres: 0.5
iou_thres: 0.5
dup_iou_thres: 0.15
confirm_iou_thres: 0.7
img_size: [1088, 608]
prop_flag: False
use_kalman: True
asso_with_motion: True
motion_lambda: 0.98
motion_gated: True
mots:
obid: 'COSTA'
mots_root: '/home/wangzd/datasets/GOT/MOTS'
save_videos: False
save_images: True
test: False
track_buffer: 30
nms_thres: 0.4
conf_thres: 0.5
iou_thres: 0.5
prop_flag: False
max_mask_area: 200
dup_iou_thres: 0.15
confirm_iou_thres: 0.7
first_stage_thres: 0.7
feat_size: [4,10]
use_kalman: True
asso_with_motion: True
motion_lambda: 0.98
motion_gated: False
posetrack:
obid: 'lighttrack_MSRA152'
data_root: '/home/wangzd/datasets/GOT/Posetrack2018'
split: 'val'
track_buffer: 30
nms_thres: 0.4
conf_thres: 0.5
iou_thres: 0.5
frame_rate: 6
save_videos: False
save_images: True
prop_flag: False
feat_size: [4,10]
max_mask_area: 400
dup_iou_thres: 0.2
confirm_iou_thres: 0.6
first_stage_thres: 0.7
use_kalman: True
asso_with_motion: True
motion_lambda: 0.9999
motion_gated: False
only_position: True
================================================
FILE: config/crw_resnet18_s3_womotion.yaml
================================================
common:
exp_name: crw_resnet18_s3_womotion
# Model related
model_type: crw
remove_layers: ['layer4']
im_mean: [0.4914, 0.4822, 0.4465]
im_std: [0.2023, 0.1994, 0.2010]
nopadding: False
head_depth: -1
resume: 'weights/crw.pth'
# Misc
down_factor: 8
infer2D: True
workers: 4
gpu_id: 0
device: cuda
sot:
dataset: 'OTB2015'
dataroot: '/home/wangzd/datasets/GOT/OTB100/'
epoch_test: False
vos:
davisroot: '/home/wangzd/datasets/uvc/DAVIS/'
split: 'val'
temperature: 0.05
topk: 10
radius: 12
videoLen: 5
cropSize: -1
head_depth: -1
no_l2: False
long_mem: [0]
infer2D: False
norm_mask: False
mot:
obid: 'FairMOT'
mot_root: '/home/wangzd/datasets/MOT/MOT16'
feat_size: [4,10]
save_videos: True
save_images: False
test_mot16: False
track_buffer: 30
min_box_area: 200
nms_thres: 0.4
conf_thres: 0.5
iou_thres: 0.5
dup_iou_thres: 0.15
confirm_iou_thres: 0.7
img_size: [1088, 608]
prop_flag: False
use_kalman: True
asso_with_motion: False
motion_lambda: 1
motion_gated: False
mots:
obid: 'COSTA'
mots_root: '/home/wangzd/datasets/GOT/MOTS'
save_videos: False
save_images: True
test: False
track_buffer: 30
nms_thres: 0.4
conf_thres: 0.5
iou_thres: 0.5
prop_flag: False
max_mask_area: 200
dup_iou_thres: 0.15
confirm_iou_thres: 0.7
first_stage_thres: 0.7
feat_size: [4,10]
use_kalman: True
asso_with_motion: False
motion_lambda: 1
motion_gated: False
posetrack:
obid: 'lighttrack_MSRA152'
data_root: '/home/wangzd/datasets/GOT/Posetrack2018'
split: 'val'
track_buffer: 30
nms_thres: 0.4
conf_thres: 0.5
iou_thres: 0.5
frame_rate: 6
save_videos: False
save_images: True
prop_flag: False
feat_size: [4,10]
max_mask_area: 400
dup_iou_thres: 0.2
confirm_iou_thres: 0.6
first_stage_thres: 0.7
use_kalman: True
asso_with_motion: False
motion_lambda: 1
motion_gated: False
only_position: True
================================================
FILE: config/imagenet_resnet18_s3.yaml
================================================
common:
exp_name: imagenet_resnet18_s3
# Model related
model_type: imagenet18
remove_layers: ['layer4']
im_mean: [0.485, 0.456, 0.406]
im_std: [0.229, 0.224, 0.225]
nopadding: False
resume: None
# Misc
down_factor: 8
infer2D: True
workers: 4
gpu_id: 0
device: cuda
sot:
dataset: 'OTB2015'
dataroot: '/home/wangzd/datasets/GOT/OTB100/'
epoch_test: False
vos:
davisroot: '/home/wangzd/datasets/uvc/DAVIS/'
split: 'val'
temperature: 0.05
topk: 10
radius: 12
videoLen: 5
cropSize: -1
head_depth: -1
no_l2: False
long_mem: [0]
infer2D: False
norm_mask: False
mot:
obid: 'FairMOT'
mot_root: '/home/wangzd/datasets/MOT/MOT16'
feat_size: [4,10]
save_videos: True
save_images: False
test_mot16: False
track_buffer: 30
min_box_area: 200
nms_thres: 0.4
conf_thres: 0.5
iou_thres: 0.5
dup_iou_thres: 0.15
confirm_iou_thres: 0.7
img_size: [1088, 608]
prop_flag: False
use_kalman: True
asso_with_motion: True
motion_lambda: 0.98
motion_gated: True
mots:
obid: 'COSTA'
mots_root: '/home/wangzd/datasets/GOT/MOTS'
save_videos: False
save_images: True
test: False
track_buffer: 30
nms_thres: 0.4
conf_thres: 0.5
iou_thres: 0.5
prop_flag: False
max_mask_area: 200
dup_iou_thres: 0.15
confirm_iou_thres: 0.7
first_stage_thres: 0.7
feat_size: [4,10]
use_kalman: True
asso_with_motion: True
motion_lambda: 0.98
motion_gated: False
posetrack:
obid: 'lighttrack_MSRA152'
data_root: '/home/wangzd/datasets/GOT/Posetrack2018'
split: 'val'
track_buffer: 30
nms_thres: 0.4
conf_thres: 0.5
iou_thres: 0.5
frame_rate: 6
save_videos: False
save_images: True
prop_flag: False
feat_size: [4,10]
max_mask_area: 400
dup_iou_thres: 0.2
confirm_iou_thres: 0.6
first_stage_thres: 0.7
use_kalman: True
asso_with_motion: True
motion_lambda: 0.9999
motion_gated: False
only_position: True
================================================
FILE: config/imagenet_resnet18_s3_womotion.yaml
================================================
common:
exp_name: imagenet_resnet18_s3_womotion
# Model related
model_type: imagenet18
remove_layers: ['layer4']
im_mean: [0.485, 0.456, 0.406]
im_std: [0.229, 0.224, 0.225]
nopadding: False
resume: None
# Misc
down_factor: 8
infer2D: True
workers: 4
gpu_id: 0
device: cuda
sot:
dataset: 'OTB2015'
dataroot: '/home/wangzd/datasets/GOT/OTB100/'
epoch_test: False
vos:
davisroot: '/home/wangzd/datasets/uvc/DAVIS/'
split: 'val'
temperature: 0.05
topk: 10
radius: 12
videoLen: 5
cropSize: -1
head_depth: -1
no_l2: False
long_mem: [0]
infer2D: False
norm_mask: False
mot:
obid: 'FairMOT'
mot_root: '/home/wangzd/datasets/MOT/MOT16'
feat_size: [4,10]
save_videos: True
save_images: False
test_mot16: False
track_buffer: 30
min_box_area: 200
nms_thres: 0.4
conf_thres: 0.5
iou_thres: 0.5
dup_iou_thres: 0.15
confirm_iou_thres: 0.7
img_size: [1088, 608]
prop_flag: False
use_kalman: True
asso_with_motion: False
motion_lambda: 1
motion_gated: False
mots:
obid: 'COSTA'
mots_root: '/home/wangzd/datasets/GOT/MOTS'
save_videos: False
save_images: True
test: False
track_buffer: 30
nms_thres: 0.4
conf_thres: 0.5
iou_thres: 0.5
prop_flag: False
max_mask_area: 200
dup_iou_thres: 0.15
confirm_iou_thres: 0.7
first_stage_thres: 0.7
feat_size: [4,10]
use_kalman: True
asso_with_motion: False
motion_lambda: 1
motion_gated: False
posetrack:
obid: 'lighttrack_MSRA152'
data_root: '/home/wangzd/datasets/GOT/Posetrack2018'
split: 'val'
track_buffer: 30
nms_thres: 0.4
conf_thres: 0.5
iou_thres: 0.5
frame_rate: 6
save_videos: False
save_images: True
prop_flag: False
feat_size: [4,10]
max_mask_area: 400
dup_iou_thres: 0.2
confirm_iou_thres: 0.6
first_stage_thres: 0.7
use_kalman: True
asso_with_motion: False
motion_lambda: 1
motion_gated: False
only_position: True
vis:
obid: 'MaskTrackRCNN'
data_root: '/home/wangzd/datasets/GOT/YoutubeVIS/'
split: 'val'
track_buffer: 30
nms_thres: 0.4
conf_thres: 0.5
iou_thres: 0.5
frame_rate: 6
save_videos: False
save_images: True
prop_flag: False
feat_size: [12,12]
max_mask_area: 1000
dup_iou_thres: 0.2
confirm_iou_thres: 0.6
first_stage_thres: 0.9
use_kalman: True
asso_with_motion: False
motion_lambda: 1
motion_gated: False
================================================
FILE: core/association/__init__.py
================================================
================================================
FILE: core/association/matching.py
================================================
import pdb
import cv2
import torch
import torch.nn.functional as F
import numpy as np
import scipy
from scipy.spatial.distance import cdist
import lap
from cython_bbox import bbox_overlaps as bbox_ious
from core.motion import kalman_filter
import time
def merge_matches(m1, m2, shape):
O,P,Q = shape
m1 = np.asarray(m1)
m2 = np.asarray(m2)
M1 = scipy.sparse.coo_matrix((np.ones(len(m1)), (m1[:, 0], m1[:, 1])), shape=(O, P))
M2 = scipy.sparse.coo_matrix((np.ones(len(m2)), (m2[:, 0], m2[:, 1])), shape=(P, Q))
mask = M1*M2
match = mask.nonzero()
match = list(zip(match[0], match[1]))
unmatched_O = tuple(set(range(O)) - set([i for i, j in match]))
unmatched_Q = tuple(set(range(Q)) - set([j for i, j in match]))
return match, unmatched_O, unmatched_Q
def linear_assignment(cost_matrix, thresh):
    """Solve a linear assignment problem with a cost cutoff.

    Returns (matches, unmatched_rows, unmatched_cols), where matches is an
    (n, 2) array of [row, col] pairs whose assignment cost is below thresh.
    """
    # Degenerate case: nothing to assign on one or both sides.
    if cost_matrix.size == 0:
        return (np.empty((0, 2), dtype=int),
                tuple(range(cost_matrix.shape[0])),
                tuple(range(cost_matrix.shape[1])))

    # lapjv returns, per row, the assigned column index (-1 when unassigned),
    # and the symmetric column-to-row mapping.
    _, row_to_col, col_to_row = lap.lapjv(
        cost_matrix, extend_cost=True, cost_limit=thresh)
    matches = np.asarray(
        [[row, col] for row, col in enumerate(row_to_col) if col >= 0])
    unmatched_a = np.where(row_to_col < 0)[0]
    unmatched_b = np.where(col_to_row < 0)[0]
    return matches, unmatched_a, unmatched_b
def ious(atlbrs, btlbrs):
    """
    Compute pairwise IoU between two sets of boxes in tlbr form
    :type atlbrs: list[tlbr] | np.ndarray
    :type btlbrs: list[tlbr] | np.ndarray
    :rtype ious np.ndarray, shape (len(atlbrs), len(btlbrs))
    """
    # np.float was removed in NumPy 1.20+/1.24; use the builtin float.
    ious = np.zeros((len(atlbrs), len(btlbrs)), dtype=float)
    if ious.size == 0:
        return ious
    ious = bbox_ious(
        np.ascontiguousarray(atlbrs, dtype=float),
        np.ascontiguousarray(btlbrs, dtype=float)
    )
    return ious
def iou_distance(atracks, btracks):
    """
    Compute cost based on IoU
    :type atracks: list[STrack]
    :type btracks: list[STrack]
    :rtype cost_matrix np.ndarray
    """
    # Inputs may already be raw tlbr ndarrays; otherwise read the boxes
    # off the track objects.
    a_is_raw = len(atracks) > 0 and isinstance(atracks[0], np.ndarray)
    b_is_raw = len(btracks) > 0 and isinstance(btracks[0], np.ndarray)
    if a_is_raw or b_is_raw:
        atlbrs, btlbrs = atracks, btracks
    else:
        atlbrs = [t.tlbr for t in atracks]
        btlbrs = [t.tlbr for t in btracks]
    # IoU is a similarity in [0, 1]; convert it to a cost.
    return 1 - ious(atlbrs, btlbrs)
def embedding_distance(tracks, detections, metric='cosine'):
    """
    Compute an appearance-embedding distance matrix between tracks and
    detections.

    :param tracks: list[STrack], each providing a `smooth_feat` vector
    :param detections: list[BaseTrack], each providing a `curr_feat` vector
    :param metric: NOTE(review): accepted for API compatibility but not
        forwarded to cdist, which therefore uses its Euclidean default —
        confirm whether passing `metric` through was intended.
    :return: cost_matrix np.ndarray of shape (len(tracks), len(detections))
    """
    # np.float was removed in NumPy 1.20+/1.24; use the builtin float.
    cost_matrix = np.zeros((len(tracks), len(detections)), dtype=float)
    if cost_matrix.size == 0:
        return cost_matrix
    det_features = np.asarray([track.curr_feat for track in detections], dtype=float)
    track_features = np.asarray([track.smooth_feat for track in tracks], dtype=float)
    # Clamp at zero to guard against tiny negative values from float error.
    cost_matrix = np.maximum(0.0, cdist(track_features, det_features))  # Normalized features
    return cost_matrix
def fuse_motion(kf, cost_matrix, tracks, detections, only_position=False, lambda_=0.98, gate=True):
    """Blend an appearance cost matrix with Kalman-filter motion distance.

    When `gate` is True, entries whose gating (Mahalanobis) distance exceeds
    the chi-square 95% threshold are set to inf; the remaining costs become a
    convex combination of appearance cost and motion distance via `lambda_`.
    """
    if cost_matrix.size == 0:
        return cost_matrix

    dof = 2 if only_position else 4
    threshold = kalman_filter.chi2inv95[dof]
    measurements = np.asarray([det.to_xyah() for det in detections])

    for idx, trk in enumerate(tracks):
        motion_dist = kf.gating_distance(
            trk.mean, trk.covariance, measurements, only_position, metric='maha')
        if gate:
            # Rule out physically implausible associations.
            cost_matrix[idx, motion_dist > threshold] = np.inf
        cost_matrix[idx] = lambda_ * cost_matrix[idx] + (1 - lambda_) * motion_dist
    return cost_matrix
def center_emb_distance(tracks, detections, metric='cosine'):
    """
    Cosine-distance matrix between track and detection center embeddings.

    :param tracks: list[STrack], each with a `smooth_feat` torch tensor
    :param detections: list[BaseTrack], each with a `curr_feat` torch tensor
    :param metric: unused; kept for signature compatibility with
        embedding_distance
    :return: cost_matrix np.ndarray of shape (len(tracks), len(detections)),
        computed as 1 - cosine similarity
    """
    # np.float was removed in NumPy 1.20+/1.24; use the builtin float.
    cost_matrix = np.zeros((len(tracks), len(detections)), dtype=float)
    if cost_matrix.size == 0:
        return cost_matrix
    det_features = torch.stack([track.curr_feat.squeeze() for track in detections])
    track_features = torch.stack([track.smooth_feat.squeeze() for track in tracks])
    # L2-normalize so the matrix product below is cosine similarity.
    normed_det = F.normalize(det_features)
    normed_track = F.normalize(track_features)
    cost_matrix = torch.mm(normed_track, normed_det.T)
    cost_matrix = 1 - cost_matrix.detach().cpu().numpy()
    return cost_matrix
def recons_distance(tracks, detections, tmp=100):
    """
    Reconstruction-based distance between track and detection feature maps.

    Each track feature map is softly reconstructed from detection features
    (and vice versa) via cross-attention with temperature `tmp`; the cost is
    the symmetrized mean absolute reconstruction residual, row-normalized by
    its maximum.

    :param tracks: list[STrack] with 4-D `smooth_feat` tensors
    :param detections: list[BaseTrack] with `curr_feat` tensors
    :param tmp: softmax temperature multiplier for the affinity
    :return: cost_matrix np.ndarray of shape (len(tracks), len(detections))

    NOTE(review): this path calls .cuda() and therefore requires a GPU.
    """
    # np.float was removed in NumPy 1.20+/1.24; use the builtin float.
    cost_matrix = np.zeros((len(tracks), len(detections)), dtype=float)
    if cost_matrix.size == 0:
        return cost_matrix
    det_features_ = torch.stack([track.curr_feat.squeeze() for track in detections])
    track_features_ = torch.stack([track.smooth_feat for track in tracks])
    det_features = F.normalize(det_features_, dim=1)
    track_features = F.normalize(track_features_, dim=1)
    ndet, ndim, nw, nh = det_features.shape
    ntrk, _, _, _ = track_features.shape
    fdet = det_features.permute(0,2,3,1).reshape(-1, ndim).cuda() # ndet*nw*nh, ndim
    ftrk = track_features.permute(0,2,3,1).reshape(-1, ndim).cuda() # ntrk*nw*nh, ndim
    # Cross-attention affinities in both directions.
    aff = torch.mm(ftrk, fdet.transpose(0,1)) # ntrk*nw*nh, ndet*nw*nh
    aff_td = F.softmax(tmp*aff, dim=1)
    aff_dt = F.softmax(tmp*aff, dim=0).transpose(0,1)
    recons_ftrk = torch.einsum('tds,dsm->tdm', aff_td.view(ntrk*nw*nh, ndet, nw*nh),
                               fdet.view(ndet, nw*nh, ndim)) # ntrk*nw*nh, ndet, ndim
    recons_fdet = torch.einsum('dts,tsm->dtm', aff_dt.view(ndet*nw*nh, ntrk, nw*nh),
                               ftrk.view(ntrk, nw*nh, ndim)) # ndet*nw*nh, ntrk, ndim
    # Residuals between each feature map and its soft reconstruction.
    res_ftrk = (recons_ftrk.permute(0,2,1) - ftrk.unsqueeze(-1)).view(ntrk, nw*nh*ndim, ndet)
    res_fdet = (recons_fdet.permute(0,2,1) - fdet.unsqueeze(-1)).view(ndet, nw*nh*ndim, ntrk)
    cost_matrix = (torch.abs(res_ftrk).mean(1) + torch.abs(res_fdet).mean(1).transpose(0,1)) * 0.5
    # Row-normalize so each track's best candidate has relative cost < 1.
    cost_matrix = cost_matrix / cost_matrix.max(1)[0].unsqueeze(-1)
    cost_matrix = cost_matrix.cpu().numpy()
    return cost_matrix
def get_track_feat(tracks, feat_flag='curr'):
    """Collect per-track features into a single zero-padded tensor.

    Selects either the current ('curr') or smoothed ('smooth') feature of
    every track, collapses any spatial dimensions to one axis, and right-pads
    each feature with zeros up to the widest one.

    :param tracks: list of tracks exposing ``curr_feat`` / ``smooth_feat``.
    :param feat_flag: which feature to take, 'curr' or 'smooth'.
    :return: tensor of shape (len(tracks), channels, max_width).
    """
    if feat_flag == 'curr':
        feats = [t.curr_feat.squeeze(0) for t in tracks]
    elif feat_flag == 'smooth':
        feats = [t.smooth_feat.squeeze(0) for t in tracks]
    else:
        raise NotImplementedError
    channels = feats[0].shape[0]
    if feats[0].dim() > 2:
        # Collapse trailing spatial dims into a single axis.
        feats = [f.view(channels, -1) for f in feats]
    widths = [f.shape[1] for f in feats]
    padded = torch.zeros(len(tracks), channels, max(widths)).to(feats[0].device)
    for idx, (f, width) in enumerate(zip(feats, widths)):
        padded[idx, :, :width] = f
    return padded
def reconsdot_distance(tracks, detections, tmp=100):
    """Appearance distance via mutual soft-reconstruction dot products.

    Track features are softly reconstructed from detection features (and vice
    versa) with a temperature-scaled cross-attention; similarity is the
    cosine between each feature and its reconstruction, averaged over both
    directions.

    :param tracks: list[STrack] exposing ``curr_feat``.
    :param detections: list[BaseTrack] exposing ``curr_feat``.
    :param tmp: softmax temperature for the cross-attention.
    :return: (cost_matrix np.ndarray of shape (ntrk, ndet), None)
    """
    # Bug fix: np.float was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin float is the documented replacement.
    cost_matrix = np.zeros((len(tracks), len(detections)), dtype=float)
    if cost_matrix.size == 0:
        return cost_matrix, None
    det_features_ = get_track_feat(detections)
    track_features_ = get_track_feat(tracks, feat_flag='curr')
    det_features = F.normalize(det_features_, dim=1)
    track_features = F.normalize(track_features_, dim=1)
    ndet, ndim, nsd = det_features.shape
    ntrk, _, nst = track_features.shape
    fdet = det_features.permute(0, 2, 1).reshape(-1, ndim).cuda()
    ftrk = track_features.permute(0, 2, 1).reshape(-1, ndim).cuda()
    aff = torch.mm(ftrk, fdet.transpose(0, 1))
    aff_td = F.softmax(tmp*aff, dim=1)
    aff_dt = F.softmax(tmp*aff, dim=0).transpose(0, 1)
    recons_ftrk = torch.einsum('tds,dsm->tdm', aff_td.view(ntrk*nst, ndet, nsd),
                               fdet.view(ndet, nsd, ndim))
    recons_fdet = torch.einsum('dts,tsm->dtm', aff_dt.view(ndet*nsd, ntrk, nst),
                               ftrk.view(ntrk, nst, ndim))
    recons_ftrk = recons_ftrk.permute(0, 2, 1).view(ntrk, nst*ndim, ndet)
    recons_ftrk_norm = F.normalize(recons_ftrk, dim=1)
    recons_fdet = recons_fdet.permute(0, 2, 1).view(ndet, nsd*ndim, ntrk)
    recons_fdet_norm = F.normalize(recons_fdet, dim=1)
    dot_td = torch.einsum('tad,ta->td', recons_ftrk_norm,
                          F.normalize(ftrk.reshape(ntrk, nst*ndim), dim=1))
    dot_dt = torch.einsum('dat,da->dt', recons_fdet_norm,
                          F.normalize(fdet.reshape(ndet, nsd*ndim), dim=1))
    # Cosine similarity -> cost; symmetric average of both directions.
    cost_matrix = 1 - 0.5 * (dot_td + dot_dt.transpose(0, 1))
    cost_matrix = cost_matrix.detach().cpu().numpy()
    return cost_matrix, None
def category_gate(cost_matrix, tracks, detections):
    """Penalize track/detection pairs whose category ids differ.

    Adds the absolute difference of category ids to each entry, leaving
    same-category pairs untouched and making cross-category matches costly.

    :param cost_matrix: np.ndarray of shape (len(tracks), len(detections))
    :param tracks: list[STrack] with a ``category`` attribute
    :param detections: list[BaseTrack] with a ``category`` attribute
    :return: gated cost_matrix np.ndarray
    """
    if cost_matrix.size == 0:
        return cost_matrix
    trk_cats = np.asarray([t.category for t in tracks])
    det_cats = np.asarray([d.category for d in detections])
    penalty = np.abs(det_cats[None, :] - trk_cats[:, None])
    return cost_matrix + penalty
================================================
FILE: core/motion/kalman_filter.py
================================================
# vim: expandtab:ts=4:sw=4
import numpy as np
import scipy.linalg
"""
Table for the 0.95 quantile of the chi-square distribution with N degrees of
freedom (contains values for N=1, ..., 9). Taken from MATLAB/Octave's chi2inv
function and used as Mahalanobis gating threshold.
"""
chi2inv95 = {
1: 3.8415,
2: 5.9915,
3: 7.8147,
4: 9.4877,
5: 11.070,
6: 12.592,
7: 14.067,
8: 15.507,
9: 16.919}
class KalmanFilter(object):
    """
    A simple Kalman filter for tracking bounding boxes in image space.
    The 8-dimensional state space
        x, y, a, h, vx, vy, va, vh
    contains the bounding box center position (x, y), aspect ratio a, height h,
    and their respective velocities.
    Object motion follows a constant velocity model. The bounding box location
    (x, y, a, h) is taken as direct observation of the state space (linear
    observation model).
    """

    def __init__(self):
        ndim, dt = 4, 1.

        # Create Kalman filter model matrices.
        # Constant-velocity transition: each position component advances by
        # dt times its paired velocity component.
        self._motion_mat = np.eye(2 * ndim, 2 * ndim)
        for i in range(ndim):
            self._motion_mat[i, ndim + i] = dt
        # Observation model: measurements see only (x, y, a, h).
        self._update_mat = np.eye(ndim, 2 * ndim)

        # Motion and observation uncertainty are chosen relative to the current
        # state estimate. These weights control the amount of uncertainty in
        # the model. This is a bit hacky.
        self._std_weight_position = 1. / 20
        self._std_weight_velocity = 1. / 160

    def initiate(self, measurement):
        """Create track from unassociated measurement.
        Parameters
        ----------
        measurement : ndarray
            Bounding box coordinates (x, y, a, h) with center position (x, y),
            aspect ratio a, and height h.
        Returns
        -------
        (ndarray, ndarray)
            Returns the mean vector (8 dimensional) and covariance matrix (8x8
            dimensional) of the new track. Unobserved velocities are initialized
            to 0 mean.
        """
        mean_pos = measurement
        mean_vel = np.zeros_like(mean_pos)
        mean = np.r_[mean_pos, mean_vel]

        # Standard deviations scale with the box height (measurement[3]);
        # the aspect ratio gets a small fixed uncertainty.
        std = [
            2 * self._std_weight_position * measurement[3],
            2 * self._std_weight_position * measurement[3],
            1e-2,
            2 * self._std_weight_position * measurement[3],
            10 * self._std_weight_velocity * measurement[3],
            10 * self._std_weight_velocity * measurement[3],
            1e-5,
            10 * self._std_weight_velocity * measurement[3]]
        covariance = np.diag(np.square(std))
        return mean, covariance

    def predict(self, mean, covariance):
        """Run Kalman filter prediction step.
        Parameters
        ----------
        mean : ndarray
            The 8 dimensional mean vector of the object state at the previous
            time step.
        covariance : ndarray
            The 8x8 dimensional covariance matrix of the object state at the
            previous time step.
        Returns
        -------
        (ndarray, ndarray)
            Returns the mean vector and covariance matrix of the predicted
            state. Unobserved velocities are initialized to 0 mean.
        """
        # Process noise also scales with the current box height mean[3].
        std_pos = [
            self._std_weight_position * mean[3],
            self._std_weight_position * mean[3],
            1e-2,
            self._std_weight_position * mean[3]]
        std_vel = [
            self._std_weight_velocity * mean[3],
            self._std_weight_velocity * mean[3],
            1e-5,
            self._std_weight_velocity * mean[3]]
        motion_cov = np.diag(np.square(np.r_[std_pos, std_vel]))

        # x' = F x ; P' = F P F^T + Q
        mean = np.dot(mean, self._motion_mat.T)
        covariance = np.linalg.multi_dot((
            self._motion_mat, covariance, self._motion_mat.T)) + motion_cov
        return mean, covariance

    def project(self, mean, covariance):
        """Project state distribution to measurement space.
        Parameters
        ----------
        mean : ndarray
            The state's mean vector (8 dimensional array).
        covariance : ndarray
            The state's covariance matrix (8x8 dimensional).
        Returns
        -------
        (ndarray, ndarray)
            Returns the projected mean and covariance matrix of the given state
            estimate.
        """
        # Measurement noise R, again scaled by box height.
        std = [
            self._std_weight_position * mean[3],
            self._std_weight_position * mean[3],
            1e-1,
            self._std_weight_position * mean[3]]
        innovation_cov = np.diag(np.square(std))

        # z = H x ; S = H P H^T + R
        mean = np.dot(self._update_mat, mean)
        covariance = np.linalg.multi_dot((
            self._update_mat, covariance, self._update_mat.T))
        return mean, covariance + innovation_cov

    def multi_predict(self, mean, covariance):
        """Run Kalman filter prediction step (Vectorized version).
        Parameters
        ----------
        mean : ndarray
            The Nx8 dimensional mean matrix of the object states at the previous
            time step.
        covariance : ndarray
            The Nx8x8 dimensional covariance matrics of the object states at the
            previous time step.
        Returns
        -------
        (ndarray, ndarray)
            Returns the mean vector and covariance matrix of the predicted
            state. Unobserved velocities are initialized to 0 mean.
        """
        std_pos = [
            self._std_weight_position * mean[:, 3],
            self._std_weight_position * mean[:, 3],
            1e-2 * np.ones_like(mean[:, 3]),
            self._std_weight_position * mean[:, 3]]
        std_vel = [
            self._std_weight_velocity * mean[:, 3],
            self._std_weight_velocity * mean[:, 3],
            1e-5 * np.ones_like(mean[:, 3]),
            self._std_weight_velocity * mean[:, 3]]
        sqr = np.square(np.r_[std_pos, std_vel]).T

        # One diagonal process-noise matrix per track.
        motion_cov = []
        for i in range(len(mean)):
            motion_cov.append(np.diag(sqr[i]))
        motion_cov = np.asarray(motion_cov)

        mean = np.dot(mean, self._motion_mat.T)
        # Batched F P F^T: transpose realigns the broadcast dot along tracks.
        left = np.dot(self._motion_mat, covariance).transpose((1, 0, 2))
        covariance = np.dot(left, self._motion_mat.T) + motion_cov
        return mean, covariance

    def update(self, mean, covariance, measurement):
        """Run Kalman filter correction step.
        Parameters
        ----------
        mean : ndarray
            The predicted state's mean vector (8 dimensional).
        covariance : ndarray
            The state's covariance matrix (8x8 dimensional).
        measurement : ndarray
            The 4 dimensional measurement vector (x, y, a, h), where (x, y)
            is the center position, a the aspect ratio, and h the height of the
            bounding box.
        Returns
        -------
        (ndarray, ndarray)
            Returns the measurement-corrected state distribution.
        """
        projected_mean, projected_cov = self.project(mean, covariance)

        # Solve K = P H^T S^-1 via a Cholesky factorization of S instead of
        # forming the inverse explicitly (cheaper and more stable).
        chol_factor, lower = scipy.linalg.cho_factor(
            projected_cov, lower=True, check_finite=False)
        kalman_gain = scipy.linalg.cho_solve(
            (chol_factor, lower), np.dot(covariance, self._update_mat.T).T,
            check_finite=False).T
        innovation = measurement - projected_mean

        new_mean = mean + np.dot(innovation, kalman_gain.T)
        new_covariance = covariance - np.linalg.multi_dot((
            kalman_gain, projected_cov, kalman_gain.T))
        return new_mean, new_covariance

    def gating_distance(self, mean, covariance, measurements,
                        only_position=False, metric='maha'):
        """Compute gating distance between state distribution and measurements.
        A suitable distance threshold can be obtained from `chi2inv95`. If
        `only_position` is False, the chi-square distribution has 4 degrees of
        freedom, otherwise 2.
        Parameters
        ----------
        mean : ndarray
            Mean vector over the state distribution (8 dimensional).
        covariance : ndarray
            Covariance of the state distribution (8x8 dimensional).
        measurements : ndarray
            An Nx4 dimensional matrix of N measurements, each in
            format (x, y, a, h) where (x, y) is the bounding box center
            position, a the aspect ratio, and h the height.
        only_position : Optional[bool]
            If True, distance computation is done with respect to the bounding
            box center position only.
        Returns
        -------
        ndarray
            Returns an array of length N, where the i-th element contains the
            squared Mahalanobis distance between (mean, covariance) and
            `measurements[i]`.
        """
        mean, covariance = self.project(mean, covariance)
        if only_position:
            mean, covariance = mean[:2], covariance[:2, :2]
            measurements = measurements[:, :2]

        d = measurements - mean
        if metric == 'gaussian':
            # Plain squared Euclidean distance (ignores covariance).
            return np.sum(d * d, axis=1)
        elif metric == 'maha':
            # Squared Mahalanobis distance via triangular solve on the
            # Cholesky factor, avoiding an explicit matrix inverse.
            cholesky_factor = np.linalg.cholesky(covariance)
            z = scipy.linalg.solve_triangular(
                cholesky_factor, d.T, lower=True, check_finite=False,
                overwrite_b=True)
            squared_maha = np.sum(z * z, axis=0)
            return squared_maha
        else:
            raise ValueError('invalid distance metric')
================================================
FILE: core/propagation/__init__.py
================================================
###################################################################
# File Name: __init__.py
# Author: Zhongdao Wang
# mail: wcd17@mails.tsinghua.edu.cn
# Created Time: Mon Jan 18 15:57:34 2021
###################################################################
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import
from .propagate_box import propagate_box
from .propagate_mask import propagate_mask
from .propagate_pose import propagate_pose
def propagate(temp_feats, obs, img, model, format='box'):
    """Dispatch observation propagation by observation format.

    :param temp_feats: template features from previous frame(s).
    :param obs: the observation to propagate, in the given format.
    :param img: current frame image.
    :param model: appearance model used for propagation.
    :param format: one of 'box', 'mask', 'pose'.
    :raises ValueError: if ``format`` is not one of the supported values.
    """
    if format == 'box':
        return propagate_box(temp_feats, obs, img, model)
    elif format == 'mask':
        # Bug fix: this branch previously dispatched to propagate_box,
        # so mask observations were never propagated as masks.
        return propagate_mask(temp_feats, obs, img, model)
    elif format == 'pose':
        return propagate_pose(temp_feats, obs, img, model)
    else:
        raise ValueError('Observation format not supported.')
================================================
FILE: core/propagation/propagate_box.py
================================================
###################################################################
# File Name: propagate_box.py
# Author: Zhongdao Wang
# mail: wcd17@mails.tsinghua.edu.cn
# Created Time: Mon Jan 18 16:01:46 2021
###################################################################
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import
def propagate_box(temp_feats, box, img, model):
    """Propagate a bounding-box observation to the current frame.

    NOTE(review): unimplemented stub — always returns None.
    """
    pass
================================================
FILE: core/propagation/propagate_mask.py
================================================
###################################################################
# File Name: propagate_box.py
# Author: Zhongdao Wang
# mail: wcd17@mails.tsinghua.edu.cn
# Created Time: Mon Jan 18 16:01:46 2021
###################################################################
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import
def propagate_mask(temp_feats, mask, img, model):
    """Propagate a segmentation-mask observation to the current frame.

    NOTE(review): unimplemented stub — always returns None.
    """
    pass
================================================
FILE: core/propagation/propagate_pose.py
================================================
###################################################################
# File Name: propagate_box.py
# Author: Zhongdao Wang
# mail: wcd17@mails.tsinghua.edu.cn
# Created Time: Mon Jan 18 16:01:46 2021
###################################################################
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import
def propagate_pose(temp_feats, pose, img, model):
    """Propagate a pose-keypoint observation to the current frame.

    NOTE(review): unimplemented stub — always returns None.
    """
    pass
================================================
FILE: data/jhmdb.py
================================================
from __future__ import print_function, absolute_import
import os
import numpy as np
import math
import scipy.io as sio
import cv2
import torch
from matplotlib import cm
from utils import im_to_numpy, im_to_torch
def resize(img, owidth, oheight):
    """Resize a torch image to (oheight, owidth) via OpenCV and return it as torch."""
    return im_to_torch(cv2.resize(im_to_numpy(img), (owidth, oheight)))
def load_image(img_path):
    """Read an image file and return a CxHxW float torch tensor in RGB, in [0, 1]."""
    bgr = cv2.imread(img_path)
    scaled = bgr.astype(np.float32) / 255.0
    # OpenCV loads BGR; flip channels to RGB and make the array contiguous.
    rgb = scaled[:, :, ::-1].copy()
    return im_to_torch(rgb)
def color_normalize(x, mean, std):
    """Normalize a CxHxW image tensor per channel, in place.

    A single-channel input is first replicated to 3 channels. Each channel
    has its mean subtracted and is divided by its std. Returns the
    (possibly replicated) tensor.
    """
    if x.size(0) == 1:
        x = x.repeat(3, 1, 1)
    for channel, mu, sigma in zip(x, mean, std):
        channel.sub_(mu).div_(sigma)
    return x
import time
######################################################################
def try_np_load(p):
    """Load an .npy/.npz file, returning None when it is missing or unreadable.

    :param p: path of the file to load.
    :return: the loaded array, or None on failure.
    """
    try:
        return np.load(p)
    except (OSError, ValueError):
        # Missing file or corrupt/incompatible contents. The original bare
        # `except:` also swallowed KeyboardInterrupt and SystemExit.
        return None
def make_lbl_set(lbls):
    """Extract the set of distinct label colors present in the first frame.

    :param lbls: array of shape (T, H, W, C) of per-pixel label colors.
    :return: (n_labels, C) uint8 array of the distinct colors of frame 0,
        in np.unique's lexicographic order.
    """
    print(lbls.shape)
    t00 = time.time()

    # Dead code removed: the original seeded lbl_set/count_lbls with dummy
    # values that were immediately overwritten below.
    flat_lbls_0 = lbls[0].copy().reshape(-1, lbls.shape[-1]).astype(np.uint8)
    lbl_set = np.unique(flat_lbls_0, axis=0)

    print('lbls', time.time() - t00)
    return lbl_set
def texturize(onehot):
    """Convert a one-hot object mask into horizontal-stripe labels.

    Rows in which the object appears (any non-background channel active) are
    split into up to 10 horizontal stripes; stripe k becomes output channel
    k+1, and channel 0 is background.

    :param onehot: (H, W, C) one-hot array; channel 0 is assumed background.
    :return: (H, W, nstripes+1) one-hot striped label map.
    """
    # Dead code removed: the original computed per-label pixel counts and an
    # unused `object_id` (np.argsort(...)[::-1][1]), which additionally
    # raised IndexError when only the background label was present.
    hidxs = []
    for h in range(onehot.shape[0]):
        # A row participates if any non-background channel is active in it.
        if np.any(onehot[h, :, 1:] == 1):
            hidxs.append(h)

    nstripes = min(10, len(hidxs))
    out = np.zeros((*onehot.shape[:2], nstripes + 1))
    out[:, :, 0] = 1

    for i, h in enumerate(hidxs):
        # Map the i-th participating row to its stripe channel.
        cidx = int(i // (len(hidxs) / nstripes))
        w = np.any(onehot[h, :, 1:] == 1, axis=-1)
        out[h][w] = 0
        out[h][w, cidx + 1] = 1
    return out
class JhmdbSet(torch.utils.data.Dataset):
    """JHMDB pose-propagation dataset.

    Each item is a whole video: normalized frames, original frames, and
    per-frame keypoint label maps rendered at feature-map resolution.
    The first ``videoLen`` frames are padded copies of frame 0.
    """
    def __init__(self, args, sigma=0.5):
        # sigma > 0 renders keypoints as Gaussian blobs; sigma == 0 uses
        # single-pixel targets (see __getitem__).
        self.filelist = args.filelist
        self.imgSize = args.imgSize
        self.videoLen = args.videoLen
        self.mapScale = args.mapScale
        self.sigma = sigma

        # Each line of the filelist is "<label .mat path> <frame folder>".
        f = open(self.filelist, 'r')
        self.jpgfiles = []
        self.lblfiles = []
        for line in f:
            rows = line.split()
            jpgfile = rows[1]
            lblfile = rows[0]
            self.jpgfiles.append(jpgfile)
            self.lblfiles.append(lblfile)
        f.close()

    def get_onehot_lbl(self, lbl_path):
        """Return the cached one-hot label array for lbl_path, or None."""
        name = '/' + '/'.join(lbl_path.split('.')[:-1]) + '_onehot.npy'
        if os.path.exists(name):
            return np.load(name)
        else:
            return None

    def make_paths(self, folder_path, label_path):
        """List the video's frame paths, front-padded with videoLen copies
        of the first frame (indices are clamped to 0)."""
        I = [ll for ll in os.listdir(folder_path) if '.png' in ll]
        frame_num = len(I) + self.videoLen
        # Frame files are numbered; sort numerically, not lexically.
        I.sort(key=lambda x: int(x.split('.')[0]))

        I_out, L_out = [], []
        for i in range(frame_num):
            i = max(0, i - self.videoLen)
            img_path = "%s/%s" % (folder_path, I[i])
            I_out.append(img_path)
        return I_out

    def __getitem__(self, index):
        folder_path = self.jpgfiles[index]
        label_path = self.lblfiles[index]

        imgs = []
        imgs_orig = []
        lbls = []
        lbls_onehot = []
        patches = []
        target_imgs = []

        img_paths = self.make_paths(folder_path, label_path)
        frame_num = len(img_paths)

        # ImageNet channel statistics used for normalization.
        mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]
        t000 = time.time()

        for i in range(frame_num):
            t00 = time.time()
            img_path = img_paths[i]
            img = load_image(img_path)  # CxHxW

            ht, wd = img.size(1), img.size(2)
            if self.imgSize > 0:
                newh, neww = ht, wd
                # Resize the short side to imgSize (ratio kept at 1.0, i.e.
                # the output is square).
                if ht <= wd:
                    ratio = 1.0  # float(wd) / float(ht)
                    # width, height
                    img = resize(img, int(self.imgSize * ratio), self.imgSize)
                    newh = self.imgSize
                    neww = int(self.imgSize * ratio)
                else:
                    ratio = 1.0  # float(ht) / float(wd)
                    # width, height
                    img = resize(img, self.imgSize, int(self.imgSize * ratio))
                    newh = int(self.imgSize * ratio)
                    neww = self.imgSize

            img_orig = img.clone()
            img = color_normalize(img, mean, std)

            imgs_orig.append(img_orig)
            imgs.append(img)

        # Label-map resolution: image size divided by the model's map scale.
        rsz_h, rsz_w = math.ceil(img.size(1) / self.mapScale[0]), math.ceil(img.size(2) / self.mapScale[1])

        # pos_img holds 1-based (x, y) keypoint coordinates per joint/frame;
        # rescale them into label-map coordinates.
        lbls_mat = sio.loadmat(label_path)
        lbls_coord = lbls_mat['pos_img']
        lbls_coord = lbls_coord - 1
        lbls_coord[0, :, :] = lbls_coord[0, :, :] * float(neww) / float(wd) / self.mapScale[0]
        lbls_coord[1, :, :] = lbls_coord[1, :, :] * float(newh) / float(ht) / self.mapScale[1]
        lblsize = (rsz_h, rsz_w)

        lbls = np.zeros((lbls_coord.shape[2], lblsize[0], lblsize[1], lbls_coord.shape[1]))
        for i in range(lbls_coord.shape[2]):
            lbls_coord_now = lbls_coord[:, :, i]
            # Blob radius scales with the pose's spatial extent in this frame.
            scales = lbls_coord_now.max(1) - lbls_coord_now.min(1)
            scale = scales.max()
            scale = max(0.5, scale*0.015)
            for j in range(lbls_coord.shape[1]):
                if self.sigma > 0:
                    draw_labelmap_np(lbls[i, :, :, j], lbls_coord_now[:, j], scale)
                else:
                    # Hard single-pixel target when no smoothing is requested.
                    tx = int(lbls_coord_now[0, j])
                    ty = int(lbls_coord_now[1, j])
                    if tx < lblsize[1] and ty < lblsize[0] and tx >= 0 and ty >= 0:
                        lbls[i, ty, tx, j] = 1.0

        # Align labels with the front-padded frame list: the first videoLen
        # entries repeat the labels of frame 0.
        lbls_tensor = torch.zeros(frame_num, lblsize[0], lblsize[1], lbls_coord.shape[1])
        for i in range(frame_num):
            if i < self.videoLen:
                nowlbl = lbls[0]
            else:
                if (i - self.videoLen < len(lbls)):
                    nowlbl = lbls[i - self.videoLen]
            lbls_tensor[i] = torch.from_numpy(nowlbl)

        # Prepend a background channel that is active where no joint is.
        lbls_tensor = torch.cat([(lbls_tensor.sum(-1) == 0)[..., None] * 1.0, lbls_tensor], dim=-1)
        # Color palette for visualization: black background + Paired colormap.
        lblset = np.arange(lbls_tensor.shape[-1] - 1)
        lblset = np.array([[0, 0, 0]] + [cm.Paired(i)[:3] for i in lblset]) * 255.0

        # Meta info
        meta = dict(folder_path=folder_path, img_paths=img_paths, lbl_paths=[])

        imgs = torch.stack(imgs)
        imgs_orig = torch.stack(imgs_orig)

        lbls_resize = lbls_tensor
        assert lbls_resize.shape[0] == len(meta['img_paths'])

        return imgs, imgs_orig, lbls_resize, lbls_tensor, lblset, meta

    def __len__(self):
        return len(self.jpgfiles)
def draw_labelmap_np(img, pt, sigma, type='Gaussian'):
    """Render a 2D Gaussian (or Cauchy) bump centered at `pt` into `img`.

    Adapted from https://github.com/anewell/pose-hg-train/blob/master/src/pypose/draw.py
    The kernel window overwrites (does not accumulate into) the corresponding
    image region; the center value is 1 (kernel is unnormalized). `img` is
    modified in place and returned; if the kernel misses the image entirely,
    it is returned untouched.
    """
    left = int(pt[0] - 3 * sigma)
    top = int(pt[1] - 3 * sigma)
    right = int(pt[0] + 3 * sigma + 1)
    bottom = int(pt[1] + 3 * sigma + 1)

    # Early out when the kernel window lies completely outside the image.
    if left >= img.shape[1] or top >= img.shape[0] or right < 0 or bottom < 0:
        return img

    # Build the kernel on a (size x size) grid centered at (cx, cy).
    size = 6 * sigma + 1
    xs = np.arange(0, size, 1, float)
    ys = xs[:, np.newaxis]
    cx = cy = size // 2
    if type == 'Gaussian':
        g = np.exp(- ((xs - cx) ** 2 + (ys - cy) ** 2) / (2 * sigma ** 2))
    elif type == 'Cauchy':
        g = sigma / (((xs - cx) ** 2 + (ys - cy) ** 2 + sigma ** 2) ** 1.5)

    # Clip kernel and image windows to their overlap.
    g_x0, g_x1 = max(0, -left), min(right, img.shape[1]) - left
    g_y0, g_y1 = max(0, -top), min(bottom, img.shape[0]) - top
    i_x0, i_x1 = max(0, left), min(right, img.shape[1])
    i_y0, i_y1 = max(0, top), min(bottom, img.shape[0])

    img[i_y0:i_y1, i_x0:i_x1] = g[g_y0:g_y1, g_x0:g_x1]
    return img
================================================
FILE: data/kinetics.py
================================================
import torchvision.datasets.video_utils
from torchvision.datasets.video_utils import VideoClips
from torchvision.datasets.utils import list_dir
from torchvision.datasets.folder import make_dataset
from torchvision.datasets.vision import VisionDataset
import numpy as np
class Kinetics400(VisionDataset):
    """
    `Kinetics-400 `_
    dataset.
    Kinetics-400 is an action recognition video dataset.
    This dataset consider every video as a collection of video clips of fixed size, specified
    by ``frames_per_clip``, where the step in frames between each clip is given by
    ``step_between_clips``.
    To give an example, for 2 videos with 10 and 15 frames respectively, if ``frames_per_clip=5``
    and ``step_between_clips=5``, the dataset size will be (2 + 3) = 5, where the first two
    elements will come from video 1, and the next three elements from video 2.
    Note that we drop clips which do not have exactly ``frames_per_clip`` elements, so not all
    frames in a video might be present.
    Internally, it uses a VideoClips object to handle clip creation.
    Args:
        root (string): Root directory of the Kinetics-400 Dataset.
        frames_per_clip (int): number of frames in a clip
        step_between_clips (int): number of frames between each clip
        transform (callable, optional): A function/transform that takes in a TxHxWxC video
            and returns a transformed version.
    Returns:
        video (Tensor[T, H, W, C]): the `T` video frames
        audio(Tensor[K, L]): the audio frames, where `K` is the number of channels
            and `L` is the number of points
        label (int): class of the video clip
    """

    def __init__(self, root, frames_per_clip, step_between_clips=1, frame_rate=None,
                 extensions=('mp4',), transform=None, cached=None, _precomputed_metadata=None):
        super(Kinetics400, self).__init__(root)
        # (Removed the no-op `extensions = extensions` self-assignment.)
        classes = list(sorted(list_dir(root)))
        class_to_idx = {cls: i for i, cls in enumerate(classes)}
        self.samples = make_dataset(self.root, class_to_idx, extensions, is_valid_file=None)
        self.classes = classes
        video_list = [x[0] for x in self.samples]
        self.video_clips = VideoClips(
            video_list,
            frames_per_clip,
            step_between_clips,
            frame_rate,
            _precomputed_metadata,
        )
        self.transform = transform

    def __len__(self):
        return self.video_clips.num_clips()

    def __getitem__(self, idx):
        success = False
        while not success:
            try:
                video, audio, info, video_idx = self.video_clips.get_clip(idx)
                success = True
            except Exception:
                # Corrupt/undecodable clip: resample a random index rather than
                # crash the loader. (Was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit.)
                print('skipped idx', idx)
                idx = np.random.randint(self.__len__())

        label = self.samples[video_idx][1]
        if self.transform is not None:
            video = self.transform(video)
        return video, audio, label
================================================
FILE: data/video.py
================================================
import os
import pdb
import glob
import json
import os.path as osp
import cv2
import numpy as np
import pycocotools.mask as mask_utils
from utils.box import xyxy2xywh
from torchvision.transforms import transforms as T
class LoadImages:  # for inference
    """Iterate (or index) a directory/file of images, yielding
    (path, letterboxed RGB CHW float image in [0, 1], original BGR image)."""

    def __init__(self, path, img_size=(1088, 608)):
        if os.path.isdir(path):
            image_format = ['.jpg', '.jpeg', '.png', '.tif']
            self.files = sorted(glob.glob('%s/*.*' % path))
            self.files = list(filter(lambda x: os.path.splitext(x)[1].lower()
                              in image_format, self.files))
        elif os.path.isfile(path):
            self.files = [path]
        else:
            # Bug fix: previously self.files was left unset here and a
            # confusing AttributeError surfaced below; fail fast instead.
            raise ValueError('Path is neither a file nor a directory: ' + path)
        self.nF = len(self.files)  # number of image files
        self.width = img_size[0]
        self.height = img_size[1]
        self.count = 0
        assert self.nF > 0, 'No images found in ' + path

    def _load(self, img_path):
        """Read one image, letterbox it, and convert to RGB CHW float [0, 1].
        (Shared by __next__ and __getitem__, which previously duplicated it.)"""
        img0 = cv2.imread(img_path)  # BGR
        assert img0 is not None, 'Failed to load ' + img_path

        # Padded resize
        img, _, _, _ = letterbox(img0, height=self.height, width=self.width)

        # Normalize RGB
        img = img[:, :, ::-1].transpose(2, 0, 1)
        img = np.ascontiguousarray(img, dtype=np.float32)
        img /= 255.0
        return img_path, img, img0

    def __iter__(self):
        self.count = -1
        return self

    def __next__(self):
        self.count += 1
        if self.count == self.nF:
            raise StopIteration
        return self._load(self.files[self.count])

    def __getitem__(self, idx):
        idx = idx % self.nF
        return self._load(self.files[idx])

    def __len__(self):
        return self.nF  # number of files
class LoadVideo:  # for inference
    """Iterate the frames of a video file, yielding
    (frame index, resized+letterboxed RGB float image, original BGR frame)."""

    def __init__(self, path, img_size=(1088, 608)):
        self.cap = cv2.VideoCapture(path)
        self.frame_rate = int(round(self.cap.get(cv2.CAP_PROP_FPS)))
        self.vw = int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        self.vh = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        self.vn = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT))
        self.width = img_size[0]
        self.height = img_size[1]
        self.count = 0

        self.w, self.h = self.get_size(self.vw, self.vh, self.width, self.height)
        # Typo fix: was 'Lenth'.
        print('Length of the video: {:d} frames'.format(self.vn))

    def get_size(self, vw, vh, dw, dh):
        """Largest (w, h) that fits inside (dw, dh) preserving aspect ratio."""
        wa, ha = float(dw) / vw, float(dh) / vh
        a = min(wa, ha)
        return int(vw * a), int(vh * a)

    def __iter__(self):
        self.count = -1
        return self

    def __next__(self):
        self.count += 1
        if self.count == len(self):
            raise StopIteration
        # Read image
        res, img0 = self.cap.read()  # BGR
        # Bug fix: also check the read() status flag, not just the frame.
        assert res and img0 is not None, 'Failed to load frame {:d}'.format(self.count)
        img0 = cv2.resize(img0, (self.w, self.h))

        # Padded resize
        img, _, _, _ = letterbox(img0, height=self.height, width=self.width)

        # Normalize RGB
        img = img[:, :, ::-1]
        img = np.ascontiguousarray(img, dtype=np.float32)
        return self.count, img, img0

    def __len__(self):
        return self.vn  # number of files
class LoadImagesAndObs:
    """Load frames together with per-frame observation (detection) files.

    Observation .txt files are expected to mirror the image tree with
    'images' replaced by 'obs/<obid>'; each row is a normalized xywh box
    plus one extra column.
    """

    def __init__(self, path, opt):
        obid = opt.obid
        img_size = getattr(opt, 'img_size', None)
        if os.path.isdir(path):
            image_format = ['.jpg', '.jpeg', '.png', '.tif']
            self.img_files = sorted(glob.glob('%s/*.*' % path))
            self.img_files = list(filter(
                lambda x: os.path.splitext(x)[1].lower() in image_format, self.img_files))
        elif os.path.isfile(path):
            self.img_files = [path, ]
        else:
            # Bug fix: previously self.img_files was left unset on an invalid
            # path; fail fast with a clear error instead.
            raise ValueError('Path is neither a file nor a directory: ' + path)
        self.label_files = [x.replace('images', osp.join('obs', obid)).replace(
            '.png', '.txt').replace('.jpg', '.txt') for x in self.img_files]
        self.nF = len(self.img_files)  # number of image files
        self.transforms = T.Compose([T.ToTensor(), T.Normalize(opt.im_mean, opt.im_std)])
        self.use_lab = getattr(opt, 'use_lab', False)
        if img_size is not None:  # idiom fix: was `if not img_size is None`
            self.width = img_size[0]
            self.height = img_size[1]

    def __getitem__(self, files_index):
        img_path = self.img_files[files_index]
        label_path = self.label_files[files_index]
        return self.get_data(img_path, label_path)

    def get_data(self, img_path, label_path):
        """Return (img_tensor, labels, original_image, (h, w)).

        Labels are xywh boxes with (x, y) at the box center, normalized by the
        letterboxed width/height.
        """
        height = self.height
        width = self.width
        img_ori = cv2.imread(img_path)  # BGR
        if img_ori is None:
            raise ValueError('File corrupt {}'.format(img_path))
        h, w, _ = img_ori.shape
        img, ratio, padw, padh = letterbox(img_ori, height=height, width=width)

        # Load labels
        if os.path.isfile(label_path):
            labels0 = np.loadtxt(label_path, dtype=np.float32).reshape(-1, 5)

            # Normalized xywh to pixel xyxy format, in letterboxed coordinates.
            labels = labels0.copy()
            labels[:, 0] = ratio * w * (labels0[:, 0] - labels0[:, 2] / 2) + padw
            labels[:, 1] = ratio * h * (labels0[:, 1] - labels0[:, 3] / 2) + padh
            labels[:, 2] = ratio * w * (labels0[:, 0] + labels0[:, 2] / 2) + padw
            labels[:, 3] = ratio * h * (labels0[:, 1] + labels0[:, 3] / 2) + padh
        else:
            labels = np.array([])

        nL = len(labels)
        if nL > 0:
            # convert xyxy to xywh and normalize by the letterboxed size
            labels[:, 0:4] = xyxy2xywh(labels[:, 0:4].copy())
            labels[:, 0] /= width
            labels[:, 1] /= height
            labels[:, 2] /= width
            labels[:, 3] /= height
        if self.use_lab:
            # Keep only the L (lightness) channel, replicated to 3 channels.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
            img = np.array([img[:, :, 0], ] * 3)
            img = img.transpose(1, 2, 0)
        img = img / 255.
        img = np.ascontiguousarray(img[:, :, ::-1])  # BGR to RGB
        if self.transforms is not None:
            img = self.transforms(img)
        return img, labels, img_ori, (h, w)

    def __len__(self):
        return self.nF  # number of batches
class LoadImagesAndObsTAO:
    """Load TAO frames plus their precomputed detections (obs).

    Each item is (img_tensor, labels, original_image, (h, w)) where labels is
    an (N, 6) array of [cx, cy, w, h, score, category_id] with the box
    normalized by the image size.
    """

    def __init__(self, root, video_meta, obs, opt):
        self.dataroot = root
        self.img_ind = [x['id'] for x in video_meta]
        self.img_files = [x['file_name'] for x in video_meta]
        self.img_files = [osp.join(root, 'frames', x) for x in self.img_files]
        self.obs = [obs.get(x, []) for x in self.img_ind]
        self.use_lab = getattr(opt, 'use_lab', False)
        self.transforms = T.Compose([T.ToTensor(), T.Normalize(opt.im_mean, opt.im_std)])

    def __getitem__(self, index):
        img_file = self.img_files[index]
        img_ori = cv2.imread(img_file)
        if img_ori is None:
            # Bug fix: the message referenced an undefined `img_path`, which
            # raised NameError instead of the intended ValueError.
            raise ValueError('File corrupt {}'.format(img_file))
        h, w, _ = img_ori.shape
        img = img_ori
        if self.use_lab:
            # Keep only the L (lightness) channel, replicated to 3 channels.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
            img = np.array([img[:, :, 0], ] * 3)
            img = img.transpose(1, 2, 0)
        img = img / 255.
        img = np.ascontiguousarray(img[:, :, ::-1])  # BGR to RGB
        if self.transforms is not None:
            img = self.transforms(img)

        obs = self.obs[index]
        if len(obs) == 0:
            # Dummy row so downstream code always sees an (N, 6) array.
            labels = np.array([[0, 0, 1, 1, -1, -1]])
        else:
            boxes = np.array([x.get('bbox', [0, 0, 1, 1]) for x in obs])
            scores = np.array([x.get('score', 0) for x in obs])[:, None]
            cat_ids = np.array([x.get('category_id', -1) for x in obs])[:, None]
            labels = np.concatenate([boxes, scores, cat_ids], axis=1)
        if len(labels) > 0:
            # From tlwh to xywh: (x,y) is the box center
            labels[:, 0] = labels[:, 0] + labels[:, 2] / 2
            labels[:, 1] = labels[:, 1] + labels[:, 3] / 2
            labels[:, 0] /= w
            labels[:, 1] /= h
            labels[:, 2] /= w
            labels[:, 3] /= h
        return img, labels, img_ori, (h, w)

    def __len__(self):
        return len(self.img_files)
class LoadImagesAndMaskObsVIS:
    """Load YouTube-VIS frames plus per-object RLE mask observations.

    Each item is (img_tensor, masks, original_image, (h, w)); masks is an
    (n_objects, h, w) array with an all-zero mask where an object is absent.
    """

    def __init__(self, path, info, obs, opt):
        self.dataroot = path
        self.nF = info['length']
        self.img_files = [osp.join(path, p) for p in info['file_names']]
        self.obsbyobj = obs
        self.transforms = T.Compose([T.ToTensor(), T.Normalize(opt.im_mean, opt.im_std)])
        self.use_lab = getattr(opt, 'use_lab', False)

    def __getitem__(self, idx):
        img_file = self.img_files[idx]
        img_ori = cv2.imread(img_file)
        if img_ori is None:
            # Bug fix: the message referenced an undefined `img_path`, which
            # raised NameError instead of the intended ValueError.
            raise ValueError('File corrupt {}'.format(img_file))
        h, w, _ = img_ori.shape
        img = img_ori
        if self.use_lab:
            # Keep only the L (lightness) channel, replicated to 3 channels.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
            img = np.array([img[:, :, 0], ] * 3)
            img = img.transpose(1, 2, 0)
        img = img / 255.
        img = np.ascontiguousarray(img[:, :, ::-1])  # BGR to RGB
        if self.transforms is not None:
            img = self.transforms(img)

        labels = list()
        for obj in self.obsbyobj:
            RLE = obj['segmentations'][idx]
            if RLE:
                labels.append(mask_utils.decode(RLE))
            else:
                # Object absent in this frame: keep an all-zero mask so the
                # object index stays stable across frames.
                labels.append(np.zeros((h, w), dtype=np.uint8))
        labels = np.stack(labels)
        return img, labels, img_ori, (h, w)

    def __len__(self):
        return self.nF
class LoadImagesAndMaskObsMOTS(LoadImagesAndObs):
    """Loader for MOTS sequences: frames plus per-frame pedestrian RLE masks."""

    def __init__(self, path, opt):
        super(LoadImagesAndMaskObsMOTS, self).__init__(path, opt)

    def get_data(self, img_path, label_path):
        """Return (img_tensor, masks, original_image, (h, w)).

        ``masks`` is an (n, h, w) array of decoded pedestrian masks, or an
        empty list when the label file is missing.
        """
        img_ori = cv2.imread(img_path)  # BGR
        if img_ori is None:
            raise ValueError('File corrupt {}'.format(img_path))
        h, w, _ = img_ori.shape
        img = img_ori
        if self.use_lab:
            # Keep only the L (lightness) channel, replicated to 3 channels.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
            img = np.array([img[:,:,0],]*3)
            img = img.transpose(1,2,0)
        img = img / 255.
        img = np.ascontiguousarray(img[ :, :, ::-1]) # BGR to RGB
        if self.transforms is not None:
            img = self.transforms(img)
        # Load labels
        labels = []
        if os.path.isfile(label_path):
            with open(label_path, 'r') as f:
                for line in f:
                    labels.append(line.strip().split())
        nL = len(labels)
        if nL > 0:
            # MOTS txt line format: frame, obj_id, class_id, mask_h, mask_w, RLE.
            # Keep only pedestrians (class id '2'). The comprehension's h/w are
            # the per-annotation mask size and do NOT leak: the h/w returned
            # below are still the image dimensions read above.
            labels = [{'size':(int(h),int(w)), 'counts':m} for \
                    _, _,cid,h,w,m in labels if cid=='2']
            labels = [mask_utils.decode(rle) for rle in labels]
            labels = np.stack(labels)
        return img, labels, img_ori, (h, w)
class LoadImagesAndPoseObs(LoadImagesAndObs):
    """Load PoseTrack frames plus pose keypoint observations from a json file.

    Each item is (img_tensor, labels, original_image, (h, w)) where labels is
    a list with one keypoint list per annotated person.
    """

    def __init__(self, obs_jpath, opt):
        # Bug fix: the json handle was never closed; use a context manager.
        with open(obs_jpath, 'r') as fjson:
            self.infoj = json.load(fjson)['annolist']
        self.dataroot = opt.data_root
        self.nF = len(self.infoj)
        self.img_files = [osp.join(opt.data_root, p['image'][0]['name']) for p in self.infoj]
        self.transforms = T.Compose([T.ToTensor(), T.Normalize(opt.im_mean, opt.im_std)])
        self.use_lab = getattr(opt, 'use_lab', False)

    def __getitem__(self, idx):
        img_file = self.img_files[idx]
        img_ori = cv2.imread(img_file)
        if img_ori is None:
            # Bug fix: the message referenced an undefined `img_path`, which
            # raised NameError instead of the intended ValueError.
            raise ValueError('File corrupt {}'.format(img_file))
        h, w, _ = img_ori.shape
        img = img_ori
        if self.use_lab:
            # Keep only the L (lightness) channel, replicated to 3 channels.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
            img = np.array([img[:, :, 0], ] * 3)
            img = img.transpose(1, 2, 0)
        img = img / 255.
        img = np.ascontiguousarray(img[:, :, ::-1])  # BGR to RGB
        if self.transforms is not None:
            img = self.transforms(img)
        info_label = self.infoj[idx]['annorect']
        # One keypoint list per annotated person. (Removed unused locals
        # `nobj` and the throwaway `labels = list()`.)
        labels = [l['annopoints'][0]['point'] for l in info_label]
        return img, labels, img_ori, (h, w)
def letterbox(img, height=608, width=1088, color=(127.5, 127.5, 127.5)):
    """Resize ``img`` to fit inside (height, width) preserving aspect ratio,
    then pad the borders with ``color`` to exactly (height, width).

    Returns (padded_img, scale_ratio, dw, dh) where dw/dh are the per-side
    paddings in pixels before rounding.
    """
    src_h, src_w = img.shape[:2]
    scale = min(float(height) / src_h, float(width) / src_w)
    resized_wh = (round(src_w * scale), round(src_h * scale))  # cv2 wants (w, h)
    dw = (width - resized_wh[0]) / 2   # horizontal padding per side
    dh = (height - resized_wh[1]) / 2  # vertical padding per side
    # The +/-0.1 nudge splits an odd padding pixel deterministically.
    top, bottom = round(dh - 0.1), round(dh + 0.1)
    left, right = round(dw - 0.1), round(dw + 0.1)
    out = cv2.resize(img, resized_wh, interpolation=cv2.INTER_AREA)
    out = cv2.copyMakeBorder(out, top, bottom, left, right,
                             cv2.BORDER_CONSTANT, value=color)
    return out, scale, dw, dh
================================================
FILE: data/vos.py
================================================
from __future__ import print_function, absolute_import
import os
import pdb
import os.path as osp
import numpy as np
import math
import cv2
import torch
import time
from matplotlib import cm
from utils import im_to_numpy, im_to_torch
def resize(img, owidth, oheight):
    """Resize a torch image to (owidth, oheight) via OpenCV.

    Round-trips through numpy because cv2 operates on H x W x C arrays.
    """
    arr = im_to_numpy(img)
    arr = cv2.resize(arr, (owidth, oheight))
    return im_to_torch(arr)
def load_image(img):
    """Load an image as a float32 RGB torch tensor in [0, 1] (C x H x W).

    Accepts either a file path (decoded via cv2, BGR) or an already-decoded
    array. Grayscale inputs are expanded to 3 channels.
    """
    if isinstance(img, str):
        img = cv2.imread(img)
    if len(img.shape) == 2:
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
    arr = img.astype(np.float32) / 255.0
    arr = arr[:, :, ::-1].copy()  # BGR -> RGB, contiguous copy
    return im_to_torch(arr)
def color_normalize(x, mean, std):
    """In-place per-channel normalization: (x - mean) / std.

    A single-channel input is first repeated to 3 channels.
    Returns the (possibly new) tensor.
    """
    if x.size(0) == 1:
        x = x.repeat(3, 1, 1)
    for channel, m, s in zip(x, mean, std):
        channel.sub_(m).div_(s)
    return x
######################################################################
def try_np_load(p):
    """Load a .npy file, returning None if it is missing or unreadable.

    Used for best-effort cache lookups. Fixed: narrowed the bare ``except:``
    to ``except Exception`` so KeyboardInterrupt/SystemExit are no longer
    swallowed.
    """
    try:
        return np.load(p)
    except Exception:
        return None
def make_lbl_set(lbls):
    """Return the unique label colours present in the first frame.

    lbls: (T, H, W, C) uint8 label frames. Returns an array of the unique
    C-vectors (colours) in frame 0, in np.unique's sorted order.
    (Removed a dead initial ``lbl_set`` assignment that was immediately
    overwritten, and a redundant ``.copy()`` — ``astype`` copies anyway.)
    """
    flat_lbls_0 = lbls[0].reshape(-1, lbls.shape[-1]).astype(np.uint8)
    return np.unique(flat_lbls_0, axis=0)
def texturize(onehot):
    """Convert a one-hot mask into horizontal-stripe pseudo-labels.

    Rows where any foreground channel (index >= 1) is active are grouped
    into at most 10 horizontal stripes; each stripe becomes its own output
    channel. Returns an (H, W, nstripes + 1) one-hot array whose channel 0
    is background.

    Fixed: removed three dead locals (the flattened unique-label set, its
    counts and the argsorted ``object_id``) whose results were never used.
    """
    hidxs = []
    for h in range(onehot.shape[0]):
        # Row h is foreground if any non-background channel fires.
        if np.any(onehot[h, :, 1:] == 1):
            hidxs.append(h)
    nstripes = min(10, len(hidxs))
    out = np.zeros((*onehot.shape[:2], nstripes + 1))
    out[:, :, 0] = 1
    for i, h in enumerate(hidxs):
        # Evenly assign foreground rows to stripes in order of appearance.
        cidx = int(i // (len(hidxs) / nstripes))
        w = np.any(onehot[h, :, 1:] == 1, axis=-1)
        out[h][w] = 0
        out[h][w, cidx + 1] = 1
    return out
class VOSDataset(torch.utils.data.Dataset):
    """DAVIS-style video object segmentation dataset.

    Each item is a whole sequence: normalized frames, the original frames,
    per-frame label masks resized to the feature-map scale, the raw label
    masks, the label colour set and a meta dict of paths. The first
    ``videoLen`` entries repeat frame 0 as warm-up context.
    """

    def __init__(self, args):
        self.davisroot = args.davisroot
        self.split = args.split
        self.imgSize = args.imgSize    # square resize target (<= 0 disables resize)
        self.videoLen = args.videoLen  # number of repeated warm-up frames
        self.mapScale = args.mapScale  # (h, w) downscale factors of the feature map
        self.texture = False
        self.round = False
        self.use_lab = getattr(args, 'use_lab', False)
        self.im_mean = args.im_mean
        self.im_std = args.im_std
        # Sequence list file, e.g. ImageSets/2017/val.txt
        filelist = osp.join(self.davisroot, 'ImageSets/2017', self.split+'.txt')
        f = open(filelist, 'r')
        self.jpgfiles = []
        self.lblfiles = []
        for line in f:
            seq = line.strip()
            self.jpgfiles.append(osp.join(self.davisroot,'JPEGImages','480p', seq))
            self.lblfiles.append(osp.join(self.davisroot, 'Annotations','480p', seq))
        f.close()

    def get_onehot_lbl(self, lbl_path):
        """Return the cached one-hot label array for ``lbl_path``, or None.

        NOTE(review): the leading '/' makes the cache path absolute; with an
        absolute davisroot this yields '//...' (harmless on POSIX), but a
        relative root would resolve against the filesystem root — confirm.
        """
        name = '/' + '/'.join(lbl_path.split('.')[:-1]) + '_onehot.npy'
        if os.path.exists(name):
            return np.load(name)
        else:
            return None

    def make_paths(self, folder_path, label_path):
        """Build per-frame (image, label) path lists, prepending
        ``videoLen`` copies of frame 0 as warm-up context."""
        I, L = os.listdir(folder_path), os.listdir(label_path)
        L = [ll for ll in L if 'npy' not in ll]  # skip cached arrays
        frame_num = len(I) + self.videoLen
        # Frames are named by their integer index.
        I.sort(key=lambda x:int(x.split('.')[0]))
        L.sort(key=lambda x:int(x.split('.')[0]))
        I_out, L_out = [], []
        for i in range(frame_num):
            i = max(0, i - self.videoLen)  # clamp: repeat frame 0
            img_path = "%s/%s" % (folder_path, I[i])
            lbl_path = "%s/%s" % (label_path, L[i])
            I_out.append(img_path)
            L_out.append(lbl_path)
        return I_out, L_out

    def __getitem__(self, index):
        """Load one full sequence; see the class docstring for the layout."""
        folder_path = self.jpgfiles[index]
        label_path = self.lblfiles[index]

        imgs = []
        imgs_orig = []
        lbls = []
        lbls_onehot = []
        patches = []
        target_imgs = []

        frame_num = len(os.listdir(folder_path)) + self.videoLen
        img_paths, lbl_paths = self.make_paths(folder_path, label_path)

        t000 = time.time()
        for i in range(frame_num):
            t00 = time.time()

            img_path, lbl_path = img_paths[i], lbl_paths[i]
            img = load_image(img_path)  # CxHxW
            lblimg = cv2.imread(lbl_path)

            '''
            Resize img to 320x320
            '''
            ht, wd = img.size(1), img.size(2)
            if self.imgSize > 0:
                newh, neww = ht, wd
                # ratio is pinned to 1.0 in both branches, so the output is
                # always a square imgSize x imgSize regardless of aspect.
                if ht <= wd:
                    ratio = 1.0  #float(wd) / float(ht)
                    # width, height
                    img = resize(img, int(self.imgSize * ratio), self.imgSize)
                    newh = self.imgSize
                    neww = int(self.imgSize * ratio)
                else:
                    ratio = 1.0  #float(ht) / float(wd)
                    # width, height
                    img = resize(img, self.imgSize, int(self.imgSize * ratio))
                    newh = int(self.imgSize * ratio)
                    neww = self.imgSize
                # NOTE(review): cv2.resize's dsize is (width, height) and its
                # third positional parameter is `dst`, not `interpolation`.
                # This call only behaves because newh == neww here; the flag
                # should be interpolation=cv2.INTER_NEAREST. Confirm before
                # changing — cached .npy files depend on current output.
                lblimg = cv2.resize(lblimg, (newh, neww), cv2.INTER_NEAREST)

            # Resized, but not augmented image
            img_orig = img.clone()

            '''
            Transforms
            '''
            if self.use_lab:
                # LAB conversion: normalize, then keep the L channel replicated.
                img = im_to_numpy(img)
                img = (img * 255).astype(np.uint8)[:,:,::-1]
                img = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
                img = im_to_torch(img) / 255.
                img = color_normalize(img, self.im_mean, self.im_std)
                img = torch.stack([img[0]]*3)
            else:
                img = color_normalize(img, self.im_mean, self.im_std)

            imgs_orig.append(img_orig)
            imgs.append(img)
            lbls.append(lblimg.copy())

        # Meta info
        meta = dict(folder_path=folder_path, img_paths=img_paths, lbl_paths=lbl_paths)

        ########################################################
        # Load reshaped label information (load cached versions if possible)
        lbls = np.stack(lbls)
        prefix = '/' + '/'.join(lbl_paths[0].split('.')[:-1])

        # Get lblset
        lblset = make_lbl_set(lbls)
        # If the label colours are consecutive values, one channel suffices.
        if np.all((lblset[1:] - lblset[:-1]) == 1):
            lblset = lblset[:, 0:1]

        onehots = []
        resizes = []
        rsz_h, rsz_w = math.ceil(img.size(1) / self.mapScale[0]), math.ceil(img.size(2) /self.mapScale[1])

        for i,p in enumerate(lbl_paths):
            prefix = '/' + '/'.join(p.split('.')[:-1])
            # print(prefix)
            oh_path = "%s_%s.npy" % (prefix, 'onehot')
            rz_path = "%s_%s.npy" % (prefix, 'size%sx%s' % (rsz_h, rsz_w))

            # One-hot encoding of the label colours (cached on disk).
            onehot = try_np_load(oh_path)
            if onehot is None:
                print('computing onehot lbl for', oh_path)
                onehot = np.stack([np.all(lbls[i] == ll, axis=-1) for ll in lblset], axis=-1)
                np.save(oh_path, onehot)

            # Feature-map-sized copy of the one-hot labels (cached on disk).
            resized = try_np_load(rz_path)
            if resized is None:
                print('computing resized lbl for', rz_path)
                # NOTE(review): third positional arg is `dst`, not
                # `interpolation` — same issue as the resize call above.
                resized = cv2.resize(np.float32(onehot), (rsz_w, rsz_h), cv2.INTER_LINEAR)
                np.save(rz_path, resized)

            if self.texture:
                # Texture mode only needs the first frame's stripes.
                texturized = texturize(resized)
                resizes.append(texturized)
                lblset = np.array([[0, 0, 0]] + [cm.Paired(i)[:3] for i in range(texturized.shape[-1])]) * 255.0
                break
            else:
                resizes.append(resized)
                onehots.append(onehot)

        if self.texture:
            # Repeat the single texturized frame across the warm-up context,
            # then pad the remaining frames with zeros.
            resizes = resizes * self.videoLen
            for _ in range(len(lbl_paths)-self.videoLen):
                resizes.append(np.zeros(resizes[0].shape))
            onehots = resizes

        ########################################################

        imgs = torch.stack(imgs)
        imgs_orig = torch.stack(imgs_orig)
        lbls_tensor = torch.from_numpy(np.stack(lbls))
        lbls_resize = np.stack(resizes)

        assert lbls_resize.shape[0] == len(meta['lbl_paths'])
        return imgs, imgs_orig, lbls_resize, lbls_tensor, lblset, meta

    def __len__(self):
        return len(self.jpgfiles)
================================================
FILE: demo/mot_demo.py
================================================
###################################################################
# File Name: mot_demo.py
# Author: Zhongdao Wang
# mail: wcd17@mails.tsinghua.edu.cn
# Created Time: Sat Jul 24 16:07:23 2021
###################################################################
import os
import sys
import yaml
import argparse
import os.path as osp
from loguru import logger
import cv2
import torch
import numpy as np
from torchvision.transforms import transforms as T
sys.path[0] = os.getcwd()
from data.video import LoadVideo
from utils.meter import Timer
from utils import visualize as vis
from detector.YOLOX.yolox.exp import get_exp
from detector.YOLOX.yolox.utils import get_model_info
from detector.YOLOX.yolox.data.datasets import COCO_CLASSES
from detector.YOLOX.tools.demo import Predictor
from utils.box import scale_box_input_size
from tracker.mot.box import BoxAssociationTracker
def make_parser():
    """Construct the CLI parser for the YOLOX + UniTrack MOT demo."""
    p = argparse.ArgumentParser("YOLOX + UniTrack MOT demo")

    # Common arguments
    p.add_argument('--demo', default='video',
                   help='demo type, eg. video or webcam')
    p.add_argument('--path', default='./docs/test_video.mp3',
                   help='path to images or video')
    p.add_argument('--save_result', action='store_true',
                   help='whether to save result')
    p.add_argument("--nms", default=None, type=float,
                   help="test nms threshold")
    p.add_argument("--tsize", default=[640, 480], type=int, nargs='+',
                   help="test img size")
    p.add_argument("--exp_file", type=str,
                   default='./detector/YOLOX/exps/default/yolox_x.py',
                   help="pls input your expriment description file")
    p.add_argument('--output-root', default='./results/mot_demo',
                   help='output directory')
    p.add_argument('--classes', type=int, nargs='+',
                   default=list(range(90)), help='COCO_CLASSES')

    # Detector related
    p.add_argument("-c", "--ckpt", type=str,
                   default='./detector/YOLOX/weights/yolox_x.pth',
                   help="model weights of the detector")
    p.add_argument("--conf", default=0.65, type=float,
                   help="detection confidence threshold")

    # UniTrack related
    p.add_argument('--config', type=str, help='tracker config file',
                   default='./config/imagenet_resnet18_s3.yaml')
    return p
def dets2obs(dets, imginfo, cls):
    """Convert raw detector output into normalized observations.

    dets rows: (x1, y1, x2, y2, obj_conf, cls_conf, cls_id). Output rows:
    (cx, cy, w, h, score, cls_id), normalized by the image size, keeping
    only the class ids listed in ``cls``. Returns an empty array when there
    is nothing to convert.
    """
    if dets is None or len(dets) == 0:
        return np.array([])
    boxes = dets.cpu().numpy()
    img_h, img_w = imginfo['height'], imginfo['width']
    # xyxy -> normalized center-x/center-y/width/height
    converted = np.zeros((len(boxes), 6))
    converted[:, 0] = (boxes[:, 0] + boxes[:, 2]) * 0.5 / img_w  # center x
    converted[:, 1] = (boxes[:, 1] + boxes[:, 3]) * 0.5 / img_h  # center y
    converted[:, 2] = (boxes[:, 2] - boxes[:, 0]) / img_w        # width
    converted[:, 3] = (boxes[:, 3] - boxes[:, 1]) / img_h        # height
    converted[:, 4] = boxes[:, 4] * boxes[:, 5]                  # combined score
    converted[:, 5] = boxes[:, 6]                                # class id
    kept = [row for row in converted if int(row[5]) in cls]
    return np.array(kept)
def eval_seq(opt, dataloader, detector, tracker,
             result_filename, save_dir=None,
             show_image=True):
    """Run detection + association tracking over one sequence.

    opt: merged CLI/yaml options (uses im_mean, im_std, classes).
    dataloader: yields (_, img, img0) per frame — img is the detector input,
    img0 the original frame used for visualization.
    Returns (last_frame_id, average_time_per_frame, timer_calls).

    Fixed: ``frame_id`` is initialized before the loop so that an empty
    dataloader returns (-1, ...) instead of raising NameError at the final
    return.
    """
    transforms = T.Compose([T.ToTensor(),
                            T.Normalize(opt.im_mean, opt.im_std)])
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    timer = Timer()
    results = []
    frame_id = -1  # robust to an empty dataloader
    for frame_id, (_, img, img0) in enumerate(dataloader):
        if frame_id % 20 == 0:
            logger.info('Processing frame {} ({:.2f} fps)'.format(
                frame_id, 1./max(1e-5, timer.average_time)))

        # run tracking
        timer.tic()
        det_outputs, img_info = detector.inference(img)
        img = img / 255.
        img = transforms(img)
        obs = dets2obs(det_outputs[0], img_info, opt.classes)
        if len(obs) == 0:
            online_targets = []
        else:
            online_targets = tracker.update(img, img0, obs)
        online_tlwhs = []
        online_ids = []
        for t in online_targets:
            tlwh = t.tlwh
            tid = t.track_id
            online_tlwhs.append(tlwh)
            online_ids.append(tid)
        timer.toc()

        # save results
        results.append((frame_id + 1, online_tlwhs, online_ids))
        if show_image or save_dir is not None:
            online_im = vis.plot_tracking(
                img0, online_tlwhs, online_ids, frame_id=frame_id,
                fps=1. / timer.average_time)
            if show_image:
                cv2.imshow('online_im', online_im)
            if save_dir is not None:
                cv2.imwrite(os.path.join(
                    save_dir, '{:05d}.jpg'.format(frame_id)), online_im)
    return frame_id, timer.average_time, timer.calls
def main(exp, args):
    """Wire up the detector, the tracker and video I/O, then run the demo.

    exp: YOLOX experiment description; args: merged CLI/yaml options.
    Writes per-frame jpgs under <output-root>/<video>/frame and assembles
    them into an .avi via ffmpeg at the end.
    """
    logger.info("Args: {}".format(args))

    # Data, I/O
    dataloader = LoadVideo(args.path, args.tsize)
    video_name = osp.basename(args.path).split('.')[0]
    result_root = osp.join(args.output_root, video_name)
    result_filename = os.path.join(result_root, 'results.txt')
    args.frame_rate = dataloader.frame_rate

    # Detector init
    det_model = exp.get_model()
    logger.info("Model Summary: {}".format(
        get_model_info(det_model, exp.test_size)))
    det_model.cuda()
    det_model.eval()
    logger.info("loading checkpoint")
    ckpt = torch.load(args.ckpt, map_location="cpu")
    # load the model state dict
    det_model.load_state_dict(ckpt["model"])
    logger.info("loaded checkpoint done.")
    detector = Predictor(det_model, exp, COCO_CLASSES, None, None, 'gpu')

    # Tracker init
    tracker = BoxAssociationTracker(args)

    frame_dir = osp.join(result_root, 'frame')
    try:
        eval_seq(args, dataloader, detector, tracker, result_filename,
                 save_dir=frame_dir, show_image=False)
    except Exception as e:
        # NOTE(review): best-effort — tracking errors are printed and the
        # demo still tries to stitch whatever frames were written.
        print(e)

    # Stitch the saved frames into a video with ffmpeg.
    output_video_path = osp.join(result_root, video_name+'.avi')
    cmd_str = 'ffmpeg -f image2 -i {}/%05d.jpg -c:v copy {}'.format(
        osp.join(result_root, 'frame'), output_video_path)
    os.system(cmd_str)
if __name__ == '__main__':
    args = make_parser().parse_args()
    # Merge the tracker yaml config into the CLI args.
    # Fixed: use yaml.safe_load — yaml.load without an explicit Loader is
    # deprecated and can construct arbitrary objects from the config file.
    with open(args.config) as f:
        common_args = yaml.safe_load(f)
    for k, v in common_args['common'].items():
        setattr(args, k, v)
    for k, v in common_args['mot'].items():
        setattr(args, k, v)
    exp = get_exp(args.exp_file, None)
    if args.conf is not None:
        args.conf_thres = args.conf
        exp.test_conf = args.conf
    if args.nms is not None:
        exp.nmsthre = args.nms
    if args.tsize is not None:
        # YOLOX expects (h, w); the CLI takes (w, h).
        exp.test_size = args.tsize[::-1]
        args.img_size = args.tsize
    # (Replaced a pointless identity list comprehension with list().)
    args.classes = list(args.classes)
    main(exp, args)
================================================
FILE: demo/sot_demo.py
================================================
# ------------------------------------------------------------------------------
# Copyright (c) Microsoft
# Licensed under the MIT License.
# Written by Zhipeng Zhang (zhangzhipeng2017@ia.ac.cn)
# ------------------------------------------------------------------------------
import os
import pdb
import sys
sys.path[0] = os.getcwd()
import cv2
import yaml
import argparse
from PIL import Image
from glob import glob
from os.path import exists, join
from easydict import EasyDict as edict
import torch
import numpy as np
import tracker.sot.lib.models as models
from tracker.sot.lib.utils.utils import load_dataset, crop_chw, \
gaussian_shaped_labels, cxy_wh_2_rect1, rect1_2_cxy_wh, cxy_wh_2_bbox
from tracker.sot.lib.core.eval_otb import eval_auc_tune
import utils
from model import AppearanceModel, partial_load
from data.vos import color_normalize, load_image, im_to_numpy, im_to_torch
def get_frames(video_name):
    """Yield BGR frames from a webcam, a video file, or an image folder.

    video_name: falsy -> webcam 0; paths ending in avi/mp4 -> video file;
    anything else is treated as a directory of numerically named ``*.jp*``
    images.

    Fixed: VideoCapture handles are now released when iteration ends
    (including early generator close); the original leaked them.
    """
    if not video_name:
        cap = cv2.VideoCapture(0)
        try:
            # warmup
            for i in range(5):
                cap.read()
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                yield frame
        finally:
            cap.release()
    elif video_name.endswith(('avi', 'mp4')):
        cap = cv2.VideoCapture(video_name)
        try:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                yield frame
        finally:
            cap.release()
    else:
        images = glob(os.path.join(video_name, '*.jp*'))
        # Sort frames by the integer filename stem.
        images = sorted(images,
                        key=lambda x: int(x.split('/')[-1].split('.')[0]))
        for img in images:
            yield cv2.imread(img)
def preproc(img, im_mean, im_std, use_lab=False):
    """Load and normalize a frame; return it as an H x W x C numpy array.

    With use_lab, the frame is converted to LAB before normalization and
    only the (replicated) L channel is kept afterwards.
    """
    tensor = load_image(img)
    if use_lab:
        arr = im_to_numpy(tensor)
        arr = (arr * 255).astype(np.uint8)[:, :, ::-1]
        arr = cv2.cvtColor(arr, cv2.COLOR_BGR2LAB)
        tensor = im_to_torch(arr) / 255.
    tensor = color_normalize(tensor, im_mean, im_std)
    if use_lab:
        tensor = torch.stack([tensor[0], ] * 3)
    return tensor.permute(1, 2, 0).numpy()  # H, W, C
class TrackerConfig(object):
    """DCF/SiamFC-style tracking hyper-parameters.

    All attributes are class-level constants shared by every tracker.
    """
    crop_sz = 512 + 8                 # template/search crop size in pixels
    downscale = 8                     # feature-map stride
    temp_sz = crop_sz // downscale    # response-map side length
    lambda0 = 1e-4                    # regularization weight
    padding = 3.5                     # context padding around the target
    interp_factor = 0.01              # model update rate
    num_scale = 3                     # scale-pyramid depth
    scale_step = 1.0275
    # Symmetric scale pyramid around 1.0, e.g. [1/step, 1, step].
    scale_factor = scale_step ** (np.arange(num_scale) - num_scale // 2)
    min_scale_factor = 0.2
    max_scale_factor = 5
    scale_penalty = 0.985
    # Penalize off-center scales so scale changes need a clearly higher peak.
    scale_penalties = scale_penalty ** (np.abs((np.arange(num_scale) - num_scale // 2)))
    net_output_size = [temp_sz, temp_sz]
    # NOTE(review): this .cuda() executes at class-definition (import) time,
    # so importing this module on a CPU-only machine fails here — confirm
    # whether lazy construction is acceptable for downstream users.
    cos_window = torch.Tensor(np.outer(np.hanning(temp_sz), np.hanning(temp_sz))).cuda()
def track(net, args):
    """Run DCF-style single-object tracking over args.input and visualize.

    Frame 0: the user draws an ROI (cv2.selectROI) which initializes the
    template and the scale bounds. Later frames: a multi-scale search crop
    is scored by the network, the penalized peak gives the new position and
    scale, and the template crop is re-extracted every frame.
    """
    toc = 0
    config = TrackerConfig()
    video_name = os.path.basename(args.input) if args.input else 'webcam'
    regions = []  # FINAL RESULTS
    for f, img_raw in enumerate(get_frames(args.input)):
        img_raw = cv2.resize(img_raw, (640,480))
        use_lab = getattr(args, 'use_lab', False)
        im = preproc(img_raw, args.im_mean, args.im_std, use_lab)
        tic = cv2.getTickCount()
        # Init
        if f == 0:
            try:
                init_rect = cv2.selectROI(video_name, img_raw, False, False)
            except Exception:
                exit()
            target_pos, target_sz = rect1_2_cxy_wh(init_rect)
            # Scale bounds reused on every later frame.
            min_sz = np.maximum(config.min_scale_factor * target_sz, 4)
            max_sz = np.minimum(im.shape[:2], config.max_scale_factor * target_sz)

            # crop template
            window_sz = target_sz * (1 + config.padding)
            bbox = cxy_wh_2_bbox(target_pos, window_sz)
            patch = crop_chw(im, bbox, config.crop_sz)

            target = patch
            net.update(torch.Tensor(np.expand_dims(target, axis=0)).cuda(), lr=1)
            regions.append(cxy_wh_2_rect1(target_pos, target_sz))
            # Reusable buffer for the multi-scale search crops below.
            patch_crop = np.zeros((config.num_scale, patch.shape[0],
                                   patch.shape[1], patch.shape[2]), np.float32)
        # Track
        else:
            for i in range(config.num_scale):  # crop multi-scale search region
                window_sz = target_sz * (config.scale_factor[i] * (1 + config.padding))
                bbox = cxy_wh_2_bbox(target_pos, window_sz)
                patch_crop[i, :] = crop_chw(im, bbox, config.crop_sz)

            search = patch_crop
            response = net(torch.Tensor(search).cuda())
            net_output_size = [response.shape[-2], response.shape[-1]]
            # Best scale = highest penalized peak across the scale pyramid.
            peak, idx = torch.max(response.view(config.num_scale, -1), 1)
            peak = peak.data.cpu().numpy() * config.scale_penalties
            best_scale = np.argmax(peak)
            r_max, c_max = np.unravel_index(idx[best_scale].cpu(), net_output_size)

            # Displacement relative to the response-map center.
            r_max = r_max - net_output_size[0] * 0.5
            c_max = c_max - net_output_size[1] * 0.5
            window_sz = target_sz * (config.scale_factor[best_scale] * (1 + config.padding))

            target_pos = target_pos + np.array([c_max, r_max]) * window_sz / net_output_size
            target_sz = np.minimum(np.maximum(window_sz / (1 + config.padding), min_sz), max_sz)

            # model update
            window_sz = target_sz * (1 + config.padding)
            bbox = cxy_wh_2_bbox(target_pos, window_sz)
            patch = crop_chw(im, bbox, config.crop_sz)
            target = patch

            regions.append(cxy_wh_2_rect1(target_pos, target_sz))  # 1-index
        toc += cv2.getTickCount() - tic

        # Draw the current estimate on the raw frame.
        bbox = list(map(int, regions[-1]))
        cv2.rectangle(img_raw, (bbox[0], bbox[1]),
                      (bbox[0]+bbox[2], bbox[1]+bbox[3]), (0, 255, 0), 3)
        cv2.imshow(video_name, img_raw)
        cv2.waitKey(40)
    toc /= cv2.getTickFrequency()
def main():
    """Entry point: parse args, build the appearance model, run tracking."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', default='', required=True, type=str)
    parser.add_argument('--input', required=True, type=str)
    args = parser.parse_args()
    # Merge the yaml config into args.
    # Fixed: use yaml.safe_load — yaml.load without an explicit Loader is
    # deprecated and can construct arbitrary objects from the config file.
    with open(args.config) as f:
        common_args = yaml.safe_load(f)
    for k, v in common_args['common'].items():
        setattr(args, k, v)
    for k, v in common_args['sot'].items():
        setattr(args, k, v)
    args.arch = 'SiamFC'

    # prepare model
    base = AppearanceModel(args).to(args.device)
    print('Total params: %.2fM' %
          (sum(p.numel() for p in base.parameters())/1e6))
    print(base)
    net = models.__dict__[args.arch](base=base, config=TrackerConfig())
    net.eval()
    net = net.cuda()
    track(net, args)


if __name__ == '__main__':
    main()
================================================
FILE: detector/YOLOX/.gitignore
================================================
### Linux ###
*~
# temporary files which can be created if a process still has a handle open of a deleted file
.fuse_hidden*
# KDE directory preferences
.directory
# Linux trash folder which might appear on any partition or disk
.Trash-*
# .nfs files are created when an open file is removed but is still being accessed
.nfs*
### PyCharm ###
# User-specific stuff
.idea
# CMake
cmake-build-*/
# Mongo Explorer plugin
.idea/**/mongoSettings.xml
# File-based project format
*.iws
# IntelliJ
out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Cursive Clojure plugin
.idea/replstate.xml
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
# Editor-based Rest Client
.idea/httpRequests
# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser
# JetBrains templates
**___jb_tmp___
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
docs/build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don’t work, or not
# install all needed dependencies.
#Pipfile.lock
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
### Vim ###
# Swap
[._]*.s[a-v][a-z]
[._]*.sw[a-p]
[._]s[a-rt-v][a-z]
[._]ss[a-gi-z]
[._]sw[a-p]
# Session
Session.vim
# Temporary
.netrwhist
# Auto-generated tag files
tags
# Persistent undo
[._]*.un~
# output
docs/api
.code-workspace.code-workspace
*.pkl
*.npy
*.pth
*.onnx
events.out.tfevents*
# vscode
*.code-workspace
.vscode
# vim
.vim
================================================
FILE: detector/YOLOX/LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2021 Megvii, Base Detection
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: detector/YOLOX/README.md
================================================
## Introduction
YOLOX is an anchor-free version of YOLO, with a simpler design but better performance! It aims to bridge the gap between research and industrial communities.
For more details, please refer to our [report on Arxiv](https://arxiv.org/abs/2107.08430).
## Updates!!
* 【2021/07/20】 We have released our technical report on [Arxiv](https://arxiv.org/abs/2107.08430).
## Coming soon
- [ ] YOLOX-P6 and larger model.
- [ ] Objects365 pretrain.
- [ ] Transformer modules.
- [ ] More features in need.
## Benchmark
#### Standard Models.
|Model |size |mAPtest 0.5:0.95 | Speed V100 (ms) | Params (M) |FLOPs (G)| weights |
| ------ |:---: | :---: |:---: |:---: | :---: | :----: |
|[YOLOX-s](./exps/default/yolox_s.py) |640 |39.6 |9.8 |9.0 | 26.8 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EW62gmO2vnNNs5npxjzunVwB9p307qqygaCkXdTO88BLUg?e=NMTQYw)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_s.pth) |
|[YOLOX-m](./exps/default/yolox_m.py) |640 |46.4 |12.3 |25.3 |73.8| [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/ERMTP7VFqrVBrXKMU7Vl4TcBQs0SUeCT7kvc-JdIbej4tQ?e=1MDo9y)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_m.pth) |
|[YOLOX-l](./exps/default/yolox_l.py) |640 |50.0 |14.5 |54.2| 155.6 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EWA8w_IEOzBKvuueBqfaZh0BeoG5sVzR-XYbOJO4YlOkRw?e=wHWOBE)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_l.pth) |
|[YOLOX-x](./exps/default/yolox_x.py) |640 |**51.2** | 17.3 |99.1 |281.9 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EdgVPHBziOVBtGAXHfeHI5kBza0q9yyueMGdT0wXZfI1rQ?e=tABO5u)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_x.pth) |
|[YOLOX-Darknet53](./exps/default/yolov3.py) |640 | 47.4 | 11.1 |63.7 | 185.3 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EZ-MV1r_fMFPkPrNjvbJEMoBLOLAnXH-XKEB77w8LhXL6Q?e=mf6wOc)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_darknet53.pth) |
#### Light Models.
|Model |size |mAPval 0.5:0.95 | Params (M) |FLOPs (G)| weights |
| ------ |:---: | :---: |:---: |:---: | :---: |
|[YOLOX-Nano](./exps/default/nano.py) |416 |25.3 | 0.91 |1.08 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EdcREey-krhLtdtSnxolxiUBjWMy6EFdiaO9bdOwZ5ygCQ?e=yQpdds)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_nano.pth) |
|[YOLOX-Tiny](./exps/default/yolox_tiny.py) |416 |31.7 | 5.06 |6.45 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EYtjNFPqvZBBrQ-VowLcSr4B6Z5TdTflUsr_gO2CwhC3bQ?e=SBTwXj)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_tiny.pth) |
## Quick Start
Installation
Step1. Install YOLOX.
```shell
git clone git@github.com:Megvii-BaseDetection/YOLOX.git
cd YOLOX
pip3 install -U pip && pip3 install -r requirements.txt
pip3 install -v -e . # or python3 setup.py develop
```
Step2. Install [apex](https://github.com/NVIDIA/apex).
```shell
# skip this step if you don't want to train model.
git clone https://github.com/NVIDIA/apex
cd apex
pip3 install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
```
Step3. Install [pycocotools](https://github.com/cocodataset/cocoapi).
```shell
pip3 install cython; pip3 install 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'
```
Demo
Step1. Download a pretrained model from the benchmark table.
Step2. Use either -n or -f to specify your detector's config. For example:
```shell
python tools/demo.py image -n yolox-s -c /path/to/your/yolox_s.pth.tar --path assets/dog.jpg --conf 0.3 --nms 0.65 --tsize 640 --save_result --device [cpu/gpu]
```
or
```shell
python tools/demo.py image -f exps/default/yolox_s.py -c /path/to/your/yolox_s.pth.tar --path assets/dog.jpg --conf 0.3 --nms 0.65 --tsize 640 --save_result --device [cpu/gpu]
```
Demo for video:
```shell
python tools/demo.py video -n yolox-s -c /path/to/your/yolox_s.pth.tar --path /path/to/your/video --conf 0.3 --nms 0.65 --tsize 640 --save_result --device [cpu/gpu]
```
Reproduce our results on COCO
Step1. Prepare COCO dataset
```shell
cd <YOLOX_HOME>
ln -s /path/to/your/COCO ./datasets/COCO
```
Step2. Reproduce our results on COCO by specifying -n:
```shell
python tools/train.py -n yolox-s -d 8 -b 64 --fp16 -o
yolox-m
yolox-l
yolox-x
```
* -d: number of gpu devices
* -b: total batch size, the recommended number for -b is num-gpu * 8
* --fp16: mixed precision training
When using -f, the above commands are equivalent to:
```shell
python tools/train.py -f exps/default/yolox-s.py -d 8 -b 64 --fp16 -o
exps/default/yolox-m.py
exps/default/yolox-l.py
exps/default/yolox-x.py
```
Evaluation
We support batch testing for fast evaluation:
```shell
python tools/eval.py -n yolox-s -c yolox_s.pth.tar -b 64 -d 8 --conf 0.001 [--fp16] [--fuse]
yolox-m
yolox-l
yolox-x
```
* --fuse: fuse conv and bn
* -d: number of GPUs used for evaluation. DEFAULT: All GPUs available will be used.
* -b: total batch size across on all GPUs
To reproduce speed test, we use the following command:
```shell
python tools/eval.py -n yolox-s -c yolox_s.pth.tar -b 1 -d 1 --conf 0.001 --fp16 --fuse
yolox-m
yolox-l
yolox-x
```
Tutorials
* [Training on custom data](docs/train_custom_data.md).
## Deployment
1. [ONNX export and an ONNXRuntime](./demo/ONNXRuntime)
2. [TensorRT in C++ and Python](./demo/TensorRT)
3. [ncnn in C++ and Java](./demo/ncnn)
4. [OpenVINO in C++ and Python](./demo/OpenVINO)
## Third-party resources
* The ncnn android app with video support: [ncnn-android-yolox](https://github.com/FeiGeChuanShu/ncnn-android-yolox) from [FeiGeChuanShu](https://github.com/FeiGeChuanShu)
* YOLOX with Tengine support: [Tengine](https://github.com/OAID/Tengine/blob/tengine-lite/examples/tm_yolox.cpp) from [BUG1989](https://github.com/BUG1989)
* YOLOX + ROS2 Foxy: [YOLOX-ROS](https://github.com/Ar-Ray-code/YOLOX-ROS) from [Ar-Ray](https://github.com/Ar-Ray-code)
* YOLOX Deploy DeepStream: [YOLOX-deepstream](https://github.com/nanmi/YOLOX-deepstream) from [nanmi](https://github.com/nanmi)
* YOLOX ONNXRuntime C++ Demo: [lite.ai](https://github.com/DefTruth/lite.ai/blob/main/ort/cv/yolox.cpp) from [DefTruth](https://github.com/DefTruth)
## Cite YOLOX
If you use YOLOX in your research, please cite our work by using the following BibTeX entry:
```latex
@article{yolox2021,
title={YOLOX: Exceeding YOLO Series in 2021},
author={Ge, Zheng and Liu, Songtao and Wang, Feng and Li, Zeming and Sun, Jian},
journal={arXiv preprint arXiv:2107.08430},
year={2021}
}
```
================================================
FILE: detector/YOLOX/datasets/README.md
================================================
# Prepare datasets
If you have a dataset directory, you could use os environment variable named `YOLOX_DATADIR`. Under this directory, YOLOX will look for datasets in the structure described below, if needed.
```
$YOLOX_DATADIR/
COCO/
```
You can set the location for builtin datasets by
```shell
export YOLOX_DATADIR=/path/to/your/datasets
```
If `YOLOX_DATADIR` is not set, the default value of dataset directory is `./datasets` relative to your current working directory.
## Expected dataset structure for [COCO detection](https://cocodataset.org/#download):
```
COCO/
annotations/
instances_{train,val}2017.json
{train,val}2017/
# image files that are mentioned in the corresponding json
```
You can use the 2014 version of the dataset as well.
================================================
FILE: detector/YOLOX/demo/ONNXRuntime/README.md
================================================
## YOLOX-ONNXRuntime in Python
This doc introduces how to convert your pytorch model into onnx, and how to run an onnxruntime demo to verify your conversion.
### Download ONNX models.
| Model | Parameters | GFLOPs | Test Size | mAP | Weights |
|:------| :----: | :----: | :---: | :---: | :---: |
| YOLOX-Nano | 0.91M | 1.08 | 416x416 | 25.3 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EfAGwvevU-lNhW5OqFAyHbwBJdI_7EaKu5yU04fgF5BU7w?e=gvq4hf)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_nano.onnx) |
| YOLOX-Tiny | 5.06M | 6.45 | 416x416 |31.7 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EVigCszU1ilDn-MwLwHCF1ABsgTy06xFdVgZ04Yyo4lHVA?e=hVKiCw)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_tiny.onnx) |
| YOLOX-S | 9.0M | 26.8 | 640x640 |39.6 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/Ec0L1d1x2UtIpbfiahgxhtgBZVjb1NCXbotO8SCOdMqpQQ?e=siyIsK)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_s.onnx) |
| YOLOX-M | 25.3M | 73.8 | 640x640 |46.4 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/ERUKlQe-nlxBoTKPy1ynbxsBmAZ_h-VBEV-nnfPdzUIkZQ?e=hyQQtl)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_m.onnx) |
| YOLOX-L | 54.2M | 155.6 | 640x640 |50.0 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/ET5w926jCA5GlVfg9ixB4KEBiW0HYl7SzaHNRaRG9dYO_A?e=ISmCYX)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_l.onnx) |
| YOLOX-Darknet53| 63.72M | 185.3 | 640x640 |47.3 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/ESArloSW-MlPlLuemLh9zKkBdovgweKbfu4zkvzKAp7pPQ?e=f81Ikw)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_darknet53.onnx) |
| YOLOX-X | 99.1M | 281.9 | 640x640 |51.2 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/ERjqoeMJlFdGuM3tQfXQmhABmGHlIHydWCwhlugeWLE9AA)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox.onnx) |
### Convert Your Model to ONNX
First, move to the YOLOX root directory:
```shell
cd <YOLOX_HOME>
```
Then, you can:
1. Convert a standard YOLOX model by -n:
```shell
python3 tools/export_onnx.py --output-name yolox_s.onnx -n yolox-s -c yolox_s.pth.tar
```
Notes:
* -n: specify a model name. The model name must be one of [yolox-s, yolox-m, yolox-l, yolox-x, yolox-nano, yolox-tiny, yolov3]
* -c: the model you have trained
* -o: opset version, default 11. **However, if you will further convert your onnx model to [OpenVINO](../OpenVINO/), please specify the opset version to 10.**
* --no-onnxsim: disable onnxsim
* To customize an input shape for onnx model, modify the following code in tools/export.py:
```python
dummy_input = torch.randn(1, 3, exp.test_size[0], exp.test_size[1])
```
2. Convert a standard YOLOX model by -f. When using -f, the above command is equivalent to:
```shell
python3 tools/export_onnx.py --output-name yolox_s.onnx -f exps/default/yolox_s.py -c yolox_s.pth.tar
```
3. To convert your customized model, please use -f:
```shell
python3 tools/export_onnx.py --output-name your_yolox.onnx -f exps/your_dir/your_yolox.py -c your_yolox.pth.tar
```
### ONNXRuntime Demo
Step1.
```shell
cd <YOLOX_HOME>/demo/ONNXRuntime
```
Step2.
```shell
python3 onnx_inference.py -m <ONNX_MODEL_PATH> -i <IMAGE_PATH> -o <OUTPUT_DIR> -s 0.3 --input_shape 640,640
```
Notes:
* -m: your converted onnx model
* -i: input_image
* -s: score threshold for visualization.
* --input_shape: should be consistent with the shape you used for onnx conversion.
================================================
FILE: detector/YOLOX/demo/ONNXRuntime/onnx_inference.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
import argparse
import os
import cv2
import numpy as np
import onnxruntime
from yolox.data.data_augment import preproc as preprocess
from yolox.data.datasets import COCO_CLASSES
from yolox.utils import mkdir, multiclass_nms, demo_postprocess, vis
def make_parser():
    """Build the CLI argument parser for the ONNXRuntime demo.

    Returns:
        argparse.ArgumentParser: parser exposing the ONNX model path, input
        image path, output directory, visualization score threshold, network
        input shape, and the p6 (extra FPN level) flag.
    """
    parser = argparse.ArgumentParser("onnxruntime inference sample")
    parser.add_argument(
        "-m",
        "--model",
        type=str,
        default="yolox.onnx",
        help="Input your onnx model.",
    )
    parser.add_argument(
        "-i",
        "--image_path",
        type=str,
        default='test_image.png',
        help="Path to your input image.",
    )
    parser.add_argument(
        "-o",
        "--output_dir",
        type=str,
        default='demo_output',
        help="Path to your output directory.",
    )
    parser.add_argument(
        "-s",
        "--score_thr",
        type=float,
        default=0.3,
        # Fixed typo in the original help string ("threshould").
        help="Score threshold to filter the result.",
    )
    parser.add_argument(
        "--input_shape",
        type=str,
        default="640,640",
        help="Specify an input shape for inference.",
    )
    parser.add_argument(
        "--with_p6",
        action="store_true",
        help="Whether your model uses p6 in FPN/PAN.",
    )
    return parser
if __name__ == '__main__':
    args = make_parser().parse_args()

    input_shape = tuple(map(int, args.input_shape.split(',')))
    origin_img = cv2.imread(args.image_path)

    # ImageNet normalization constants, matching what the model saw at export.
    mean = (0.485, 0.456, 0.406)
    std = (0.229, 0.224, 0.225)
    img, ratio = preprocess(origin_img, input_shape, mean, std)

    # Single-image inference through ONNXRuntime; the model has one input.
    session = onnxruntime.InferenceSession(args.model)
    input_name = session.get_inputs()[0].name
    output = session.run(None, {input_name: img[None, :, :, :]})
    predictions = demo_postprocess(output[0], input_shape, p6=args.with_p6)[0]

    # Each row is [cx, cy, w, h, obj, cls...]; score = objectness * class prob.
    boxes = predictions[:, :4]
    scores = predictions[:, 4:5] * predictions[:, 5:]

    # Convert center-size boxes to corner form, then undo the resize ratio.
    boxes_xyxy = np.ones_like(boxes)
    half_w = boxes[:, 2] / 2.
    half_h = boxes[:, 3] / 2.
    boxes_xyxy[:, 0] = boxes[:, 0] - half_w
    boxes_xyxy[:, 1] = boxes[:, 1] - half_h
    boxes_xyxy[:, 2] = boxes[:, 0] + half_w
    boxes_xyxy[:, 3] = boxes[:, 1] + half_h
    boxes_xyxy /= ratio

    dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.65, score_thr=0.1)
    if dets is not None:
        final_boxes = dets[:, :4]
        final_scores = dets[:, 4]
        final_cls_inds = dets[:, 5]
        origin_img = vis(origin_img, final_boxes, final_scores, final_cls_inds,
                         conf=args.score_thr, class_names=COCO_CLASSES)

    # Write the visualization next to the original file name.
    mkdir(args.output_dir)
    output_path = os.path.join(args.output_dir, args.image_path.split("/")[-1])
    cv2.imwrite(output_path, origin_img)
================================================
FILE: detector/YOLOX/demo/OpenVINO/README.md
================================================
## YOLOX for OpenVINO
* [C++ Demo](./cpp)
* [Python Demo](./python)
================================================
FILE: detector/YOLOX/demo/OpenVINO/cpp/CMakeLists.txt
================================================
# Build configuration for the YOLOX OpenVINO C++ demo (yolox_openvino.cpp).
cmake_minimum_required(VERSION 3.4.1)
# C++14 is required by the Inference Engine / OpenCV headers used by the demo.
set(CMAKE_CXX_STANDARD 14)
project(yolox_openvino_demo)
find_package(OpenCV REQUIRED)
find_package(InferenceEngine REQUIRED)
find_package(ngraph REQUIRED)
include_directories(
    ${OpenCV_INCLUDE_DIRS}
    ${CMAKE_CURRENT_SOURCE_DIR}
    ${CMAKE_CURRENT_BINARY_DIR}
)
add_executable(yolox_openvino yolox_openvino.cpp)
# NGRAPH_LIBRARIES is exported by find_package(ngraph) above.
target_link_libraries(
    yolox_openvino
    ${InferenceEngine_LIBRARIES}
    ${NGRAPH_LIBRARIES}
    ${OpenCV_LIBS}
)
================================================
FILE: detector/YOLOX/demo/OpenVINO/cpp/README.md
================================================
# YOLOX-OpenVINO in C++
This tutorial includes a C++ demo for OpenVINO, as well as some converted models.
### Download OpenVINO models.
| Model | Parameters | GFLOPs | Test Size | mAP | Weights |
|:------| :----: | :----: | :---: | :---: | :---: |
| [YOLOX-Nano](../../../exps/nano.py) | 0.91M | 1.08 | 416x416 | 25.3 | [Download](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EeWY57o5wQZFtXYd1KJw6Z8B4vxZru649XxQHYIFgio3Qw?e=ZS81ce)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_nano_openvino.tar.gz) |
| [YOLOX-Tiny](../../../exps/yolox_tiny.py) | 5.06M | 6.45 | 416x416 |31.7 | [Download](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/ETfvOoCXdVZNinoSpKA_sEYBIQVqfjjF5_M6VvHRnLVcsA?e=STL1pi)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_tiny_openvino.tar.gz) |
| [YOLOX-S](../../../exps/yolox_s.py) | 9.0M | 26.8 | 640x640 |39.6 | [Download](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EXUjf3PQnbBLrxNrXPueqaIBzVZOrYQOnJpLK1Fytj5ssA?e=GK0LOM)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_s_openvino.tar.gz) |
| [YOLOX-M](../../../exps/yolox_m.py) | 25.3M | 73.8 | 640x640 |46.4 | [Download](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EcoT1BPpeRpLvE_4c441zn8BVNCQ2naxDH3rho7WqdlgLQ?e=95VaM9)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_m_openvino.tar.gz) |
| [YOLOX-L](../../../exps/yolox_l.py) | 54.2M | 155.6 | 640x640 |50.0 | [Download](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EZvmn-YLRuVPh0GAP_w3xHMB2VGvrKqQXyK_Cv5yi_DXUg?e=YRh6Eq)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_l_openvino.tar.gz) |
| [YOLOX-Darknet53](../../../exps/yolov3.py) | 63.72M | 185.3 | 640x640 |47.3 | [Download](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EQP8LSroikFHuwX0jFRetmcBOCDWSFmylHxolV7ezUPXGw?e=bEw5iq)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_darknet53_openvino.tar.gz) |
| [YOLOX-X](../../../exps/yolox_x.py) | 99.1M | 281.9 | 640x640 |51.2 | [Download](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EZFPnLqiD-xIlt7rcZYDjQgB4YXE9wnq1qaSXQwJrsKbdg?e=83nwEz)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_x_openvino.tar.gz) |
## Install OpenVINO Toolkit
Please visit [Openvino Homepage](https://docs.openvinotoolkit.org/latest/get_started_guides.html) for more details.
## Set up the Environment
### For Linux
**Option1. Set up the environment temporarily. You need to run this command every time you start a new shell window.**
```shell
source /opt/intel/openvino_2021/bin/setupvars.sh
```
**Option2. Set up the environment permanently.**
*Step1.* For Linux:
```shell
vim ~/.bashrc
```
*Step2.* Add the following line into your file:
```shell
source /opt/intel/openvino_2021/bin/setupvars.sh
```
*Step3.* Save and exit the file, then run:
```shell
source ~/.bashrc
```
## Convert model
1. Export ONNX model
Please refer to the [ONNX tutorial](../../ONNXRuntime). **Note that you should set --opset to 10, otherwise your next step will fail.**
2. Convert ONNX to OpenVINO
``` shell
cd /openvino_2021/deployment_tools/model_optimizer
```
Install requirements for convert tool
```shell
sudo ./install_prerequisites/install_prerequisites_onnx.sh
```
Then convert model.
```shell
python3 mo.py --input_model <ONNX_MODEL> --input_shape <INPUT_SHAPE> [--data_type FP16]
```
For example:
```shell
python3 mo.py --input_model yolox.onnx --input_shape (1,3,640,640) --data_type FP16
```
## Build
### Linux
```shell
source /opt/intel/openvino_2021/bin/setupvars.sh
mkdir build
cd build
cmake ..
make
```
## Demo
### c++
```shell
./yolox_openvino <XML_MODEL_PATH> <IMAGE_PATH> <DEVICE>
```
================================================
FILE: detector/YOLOX/demo/OpenVINO/cpp/yolox_openvino.cpp
================================================
// Copyright (C) 2018-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include
#include
#include
#include
#include
#include
#include
using namespace InferenceEngine;
/**
* @brief Define names based depends on Unicode path support
*/
// Plain aliases: this build does not need Unicode-path variants.
#define tcout std::cout
#define file_name_t std::string
#define imread_t cv::imread
// Post-processing thresholds: IoU cut for NMS and minimum obj*class score.
#define NMS_THRESH 0.65
#define BBOX_CONF_THRESH 0.3
// Network input resolution used by static_resize/decode_outputs (square 416).
static const int INPUT_W = 416;
static const int INPUT_H = 416;
// Letterbox-resize `img` to the network input (INPUT_W x INPUT_H): scale so
// the image fits, paste at the top-left, pad the rest with gray (114,114,114).
// Returns the padded image; the scale factor must be recomputed by the caller.
cv::Mat static_resize(cv::Mat& img) {
    float r = std::min(INPUT_W / (img.cols*1.0), INPUT_H / (img.rows*1.0));
    // r = std::min(r, 1.0f);
    int unpad_w = r * img.cols;
    int unpad_h = r * img.rows;
    cv::Mat re(unpad_h, unpad_w, CV_8UC3);
    cv::resize(img, re, re.size());
    // cv::Mat takes (rows, cols). The original passed (INPUT_W, INPUT_H);
    // harmless while both are 416, but wrong for non-square input sizes.
    cv::Mat out(INPUT_H, INPUT_W, CV_8UC3, cv::Scalar(114, 114, 114));
    re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows)));
    return out;
}
// Copy an 8-bit BGR image into the network input blob in NCHW float layout,
// converting to RGB and normalizing with ImageNet mean/std.
// NOTE(review): mutates `img` in place (BGR -> RGB conversion).
// Template arguments stripped by extraction are restored below.
void blobFromImage(cv::Mat& img, Blob::Ptr& blob){
    cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
    int channels = 3;
    int img_h = img.rows;
    int img_w = img.cols;
    std::vector<float> mean = {0.485, 0.456, 0.406};
    std::vector<float> std = {0.229, 0.224, 0.225};
    InferenceEngine::MemoryBlob::Ptr mblob = InferenceEngine::as<InferenceEngine::MemoryBlob>(blob);
    if (!mblob)
    {
        THROW_IE_EXCEPTION << "We expect blob to be inherited from MemoryBlob in matU8ToBlob, "
            << "but by fact we were not able to cast inputBlob to MemoryBlob";
    }
    // locked memory holder should be alive all time while access to its buffer happens
    auto mblobHolder = mblob->wmap();
    float *blob_data = mblobHolder.as<float *>();
    for (size_t c = 0; c < channels; c++)
    {
        for (size_t h = 0; h < img_h; h++)
        {
            for (size_t w = 0; w < img_w; w++)
            {
                // HWC uint8 -> CHW float: scale to [0,1], subtract mean, divide by std.
                blob_data[c * img_w * img_h + h * img_w + w] =
                    (((float)img.at<cv::Vec3b>(h, w)[c]) / 255.0f - mean[c]) / std[c];
            }
        }
    }
}
// A single detection: bounding box (float pixel coords), COCO class label,
// and confidence (objectness * class score). Restored the cv::Rect_ element
// type stripped by extraction.
struct Object
{
    cv::Rect_<float> rect;
    int label;
    float prob;
};
// One anchor point of the decoded output: cell coordinates (grid0, grid1)
// within a feature map, plus that map's stride in input pixels.
struct GridAndStride
{
    int grid0;
    int grid1;
    int stride;
};
static void generate_grids_and_stride(const int target_size, std::vector& strides, std::vector& grid_strides)
{
for (auto stride : strides)
{
int num_grid = target_size / stride;
for (int g1 = 0; g1 < num_grid; g1++)
{
for (int g0 = 0; g0 < num_grid; g0++)
{
grid_strides.push_back((GridAndStride){g0, g1, stride});
}
}
}
}
static void generate_yolox_proposals(std::vector grid_strides, const float* feat_ptr, float prob_threshold, std::vector& objects)
{
const int num_class = 80; // COCO has 80 classes. Modify this value on your own dataset.
const int num_anchors = grid_strides.size();
for (int anchor_idx = 0; anchor_idx < num_anchors; anchor_idx++)
{
const int grid0 = grid_strides[anchor_idx].grid0;
const int grid1 = grid_strides[anchor_idx].grid1;
const int stride = grid_strides[anchor_idx].stride;
const int basic_pos = anchor_idx * 85;
// yolox/models/yolo_head.py decode logic
// outputs[..., :2] = (outputs[..., :2] + grids) * strides
// outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides
float x_center = (feat_ptr[basic_pos + 0] + grid0) * stride;
float y_center = (feat_ptr[basic_pos + 1] + grid1) * stride;
float w = exp(feat_ptr[basic_pos + 2]) * stride;
float h = exp(feat_ptr[basic_pos + 3]) * stride;
float x0 = x_center - w * 0.5f;
float y0 = y_center - h * 0.5f;
float box_objectness = feat_ptr[basic_pos + 4];
for (int class_idx = 0; class_idx < num_class; class_idx++)
{
float box_cls_score = feat_ptr[basic_pos + 5 + class_idx];
float box_prob = box_objectness * box_cls_score;
if (box_prob > prob_threshold)
{
Object obj;
obj.rect.x = x0;
obj.rect.y = y0;
obj.rect.width = w;
obj.rect.height = h;
obj.label = class_idx;
obj.prob = box_prob;
objects.push_back(obj);
}
} // class loop
} // point anchor loop
}
static inline float intersection_area(const Object& a, const Object& b)
{
cv::Rect_ inter = a.rect & b.rect;
return inter.area();
}
static void qsort_descent_inplace(std::vector& faceobjects, int left, int right)
{
int i = left;
int j = right;
float p = faceobjects[(left + right) / 2].prob;
while (i <= j)
{
while (faceobjects[i].prob > p)
i++;
while (faceobjects[j].prob < p)
j--;
if (i <= j)
{
// swap
std::swap(faceobjects[i], faceobjects[j]);
i++;
j--;
}
}
#pragma omp parallel sections
{
#pragma omp section
{
if (left < j) qsort_descent_inplace(faceobjects, left, j);
}
#pragma omp section
{
if (i < right) qsort_descent_inplace(faceobjects, i, right);
}
}
}
static void qsort_descent_inplace(std::vector& objects)
{
if (objects.empty())
return;
qsort_descent_inplace(objects, 0, objects.size() - 1);
}
static void nms_sorted_bboxes(const std::vector& faceobjects, std::vector& picked, float nms_threshold)
{
picked.clear();
const int n = faceobjects.size();
std::vector areas(n);
for (int i = 0; i < n; i++)
{
areas[i] = faceobjects[i].rect.area();
}
for (int i = 0; i < n; i++)
{
const Object& a = faceobjects[i];
int keep = 1;
for (int j = 0; j < (int)picked.size(); j++)
{
const Object& b = faceobjects[picked[j]];
// intersection over union
float inter_area = intersection_area(a, b);
float union_area = areas[i] + areas[picked[j]] - inter_area;
// float IoU = inter_area / union_area
if (inter_area / union_area > nms_threshold)
keep = 0;
}
if (keep)
picked.push_back(i);
}
}
static void decode_outputs(const float* prob, std::vector& objects, float scale, const int img_w, const int img_h) {
std::vector proposals;
std::vector strides = {8, 16, 32};
std::vector grid_strides;
generate_grids_and_stride(INPUT_W, strides, grid_strides);
generate_yolox_proposals(grid_strides, prob, BBOX_CONF_THRESH, proposals);
qsort_descent_inplace(proposals);
std::vector picked;
nms_sorted_bboxes(proposals, picked, NMS_THRESH);
int count = picked.size();
objects.resize(count);
for (int i = 0; i < count; i++)
{
objects[i] = proposals[picked[i]];
// adjust offset to original unpadded
float x0 = (objects[i].rect.x) / scale;
float y0 = (objects[i].rect.y) / scale;
float x1 = (objects[i].rect.x + objects[i].rect.width) / scale;
float y1 = (objects[i].rect.y + objects[i].rect.height) / scale;
// clip
x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);
objects[i].rect.x = x0;
objects[i].rect.y = y0;
objects[i].rect.width = x1 - x0;
objects[i].rect.height = y1 - y0;
}
}
// Per-class display colors (RGB components in [0,1]), indexed by COCO class
// id; multiplied by 255 when drawing in draw_objects.
const float color_list[80][3] =
{
    {0.000, 0.447, 0.741},
    {0.850, 0.325, 0.098},
    {0.929, 0.694, 0.125},
    {0.494, 0.184, 0.556},
    {0.466, 0.674, 0.188},
    {0.301, 0.745, 0.933},
    {0.635, 0.078, 0.184},
    {0.300, 0.300, 0.300},
    {0.600, 0.600, 0.600},
    {1.000, 0.000, 0.000},
    {1.000, 0.500, 0.000},
    {0.749, 0.749, 0.000},
    {0.000, 1.000, 0.000},
    {0.000, 0.000, 1.000},
    {0.667, 0.000, 1.000},
    {0.333, 0.333, 0.000},
    {0.333, 0.667, 0.000},
    {0.333, 1.000, 0.000},
    {0.667, 0.333, 0.000},
    {0.667, 0.667, 0.000},
    {0.667, 1.000, 0.000},
    {1.000, 0.333, 0.000},
    {1.000, 0.667, 0.000},
    {1.000, 1.000, 0.000},
    {0.000, 0.333, 0.500},
    {0.000, 0.667, 0.500},
    {0.000, 1.000, 0.500},
    {0.333, 0.000, 0.500},
    {0.333, 0.333, 0.500},
    {0.333, 0.667, 0.500},
    {0.333, 1.000, 0.500},
    {0.667, 0.000, 0.500},
    {0.667, 0.333, 0.500},
    {0.667, 0.667, 0.500},
    {0.667, 1.000, 0.500},
    {1.000, 0.000, 0.500},
    {1.000, 0.333, 0.500},
    {1.000, 0.667, 0.500},
    {1.000, 1.000, 0.500},
    {0.000, 0.333, 1.000},
    {0.000, 0.667, 1.000},
    {0.000, 1.000, 1.000},
    {0.333, 0.000, 1.000},
    {0.333, 0.333, 1.000},
    {0.333, 0.667, 1.000},
    {0.333, 1.000, 1.000},
    {0.667, 0.000, 1.000},
    {0.667, 0.333, 1.000},
    {0.667, 0.667, 1.000},
    {0.667, 1.000, 1.000},
    {1.000, 0.000, 1.000},
    {1.000, 0.333, 1.000},
    {1.000, 0.667, 1.000},
    {0.333, 0.000, 0.000},
    {0.500, 0.000, 0.000},
    {0.667, 0.000, 0.000},
    {0.833, 0.000, 0.000},
    {1.000, 0.000, 0.000},
    {0.000, 0.167, 0.000},
    {0.000, 0.333, 0.000},
    {0.000, 0.500, 0.000},
    {0.000, 0.667, 0.000},
    {0.000, 0.833, 0.000},
    {0.000, 1.000, 0.000},
    {0.000, 0.000, 0.167},
    {0.000, 0.000, 0.333},
    {0.000, 0.000, 0.500},
    {0.000, 0.000, 0.667},
    {0.000, 0.000, 0.833},
    {0.000, 0.000, 1.000},
    {0.000, 0.000, 0.000},
    {0.143, 0.143, 0.143},
    {0.286, 0.286, 0.286},
    {0.429, 0.429, 0.429},
    {0.571, 0.571, 0.571},
    {0.714, 0.714, 0.714},
    {0.857, 0.857, 0.857},
    {0.000, 0.447, 0.741},
    {0.314, 0.717, 0.741},
    {0.50, 0.5, 0}
};
// Render detections onto a copy of `bgr`: colored box, class name + score
// label with a contrast-picked text color, then save to "_demo.jpg".
// Also logs each detection to stderr. Restored the vector element type
// stripped by extraction.
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    static const char* class_names[] = {
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush"
    };
    cv::Mat image = bgr.clone();
    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];
        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
            obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
        cv::Scalar color = cv::Scalar(color_list[obj.label][0], color_list[obj.label][1], color_list[obj.label][2]);
        // Pick black text on bright boxes, white text on dark ones.
        float c_mean = cv::mean(color)[0];
        cv::Scalar txt_color;
        if (c_mean > 0.5){
            txt_color = cv::Scalar(0, 0, 0);
        }else{
            txt_color = cv::Scalar(255, 255, 255);
        }
        cv::rectangle(image, obj.rect, color * 255, 2);
        char text[256];
        sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);
        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.4, 1, &baseLine);
        // Label background uses a darkened box color, scaled to 0-255.
        cv::Scalar txt_bk_color = color * 0.7 * 255;
        int x = obj.rect.x;
        int y = obj.rect.y + 1;
        //int y = obj.rect.y - label_size.height - baseLine;
        if (y > image.rows)
            y = image.rows;
        //if (x + label_size.width > image.cols)
        //x = image.cols - label_size.width;
        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      txt_bk_color, -1);
        cv::putText(image, text, cv::Point(x, y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.4, txt_color, 1);
    }
    cv::imwrite("_demo.jpg" , image);
    fprintf(stderr, "save vis file\n");
    /* cv::imshow("image", image); */
    /* cv::waitKey(0); */
}
int main(int argc, char* argv[]) {
try {
// ------------------------------ Parsing and validation of input arguments
// ---------------------------------
if (argc != 4) {
tcout << "Usage : " << argv[0] << " " << std::endl;
return EXIT_FAILURE;
}
const file_name_t input_model {argv[1]};
const file_name_t input_image_path {argv[2]};
const std::string device_name {argv[3]};
// -----------------------------------------------------------------------------------------------------
// --------------------------- Step 1. Initialize inference engine core
// -------------------------------------
Core ie;
// -----------------------------------------------------------------------------------------------------
// Step 2. Read a model in OpenVINO Intermediate Representation (.xml and
// .bin files) or ONNX (.onnx file) format
CNNNetwork network = ie.ReadNetwork(input_model);
if (network.getOutputsInfo().size() != 1)
throw std::logic_error("Sample supports topologies with 1 output only");
if (network.getInputsInfo().size() != 1)
throw std::logic_error("Sample supports topologies with 1 input only");
// -----------------------------------------------------------------------------------------------------
// --------------------------- Step 3. Configure input & output
// ---------------------------------------------
// --------------------------- Prepare input blobs
// -----------------------------------------------------
InputInfo::Ptr input_info = network.getInputsInfo().begin()->second;
std::string input_name = network.getInputsInfo().begin()->first;
/* Mark input as resizable by setting of a resize algorithm.
* In this case we will be able to set an input blob of any shape to an
* infer request. Resize and layout conversions are executed automatically
* during inference */
//input_info->getPreProcess().setResizeAlgorithm(RESIZE_BILINEAR);
//input_info->setLayout(Layout::NHWC);
//input_info->setPrecision(Precision::FP32);
// --------------------------- Prepare output blobs
// ----------------------------------------------------
if (network.getOutputsInfo().empty()) {
std::cerr << "Network outputs info is empty" << std::endl;
return EXIT_FAILURE;
}
DataPtr output_info = network.getOutputsInfo().begin()->second;
std::string output_name = network.getOutputsInfo().begin()->first;
output_info->setPrecision(Precision::FP32);
// -----------------------------------------------------------------------------------------------------
// --------------------------- Step 4. Loading a model to the device
// ------------------------------------------
ExecutableNetwork executable_network = ie.LoadNetwork(network, device_name);
// -----------------------------------------------------------------------------------------------------
// --------------------------- Step 5. Create an infer request
// -------------------------------------------------
InferRequest infer_request = executable_network.CreateInferRequest();
// -----------------------------------------------------------------------------------------------------
// --------------------------- Step 6. Prepare input
// --------------------------------------------------------
/* Read input image to a blob and set it to an infer request without resize
* and layout conversions. */
cv::Mat image = imread_t(input_image_path);
cv::Mat pr_img = static_resize(image);
Blob::Ptr imgBlob = infer_request.GetBlob(input_name); // just wrap Mat data by Blob::Ptr
blobFromImage(pr_img, imgBlob);
// infer_request.SetBlob(input_name, imgBlob); // infer_request accepts input blob of any size
// -----------------------------------------------------------------------------------------------------
// --------------------------- Step 7. Do inference
// --------------------------------------------------------
/* Running the request synchronously */
infer_request.Infer();
// -----------------------------------------------------------------------------------------------------
// --------------------------- Step 8. Process output
// ------------------------------------------------------
const Blob::Ptr output_blob = infer_request.GetBlob(output_name);
MemoryBlob::CPtr moutput = as(output_blob);
if (!moutput) {
throw std::logic_error("We expect output to be inherited from MemoryBlob, "
"but by fact we were not able to cast output to MemoryBlob");
}
// locked memory holder should be alive all time while access to its buffer
// happens
auto moutputHolder = moutput->rmap();
const float* net_pred = moutputHolder.as::value_type*>();
const int image_size = 416;
int img_w = image.cols;
int img_h = image.rows;
float scale = std::min(INPUT_W / (image.cols*1.0), INPUT_H / (image.rows*1.0));
std::vector objects;
decode_outputs(net_pred, objects, scale, img_w, img_h);
draw_objects(image, objects);
// -----------------------------------------------------------------------------------------------------
} catch (const std::exception& ex) {
std::cerr << ex.what() << std::endl;
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}
================================================
FILE: detector/YOLOX/demo/OpenVINO/python/README.md
================================================
# YOLOX-OpenVINO in Python
This tutorial includes a Python demo for OpenVINO, as well as some converted models.
### Download OpenVINO models.
| Model | Parameters | GFLOPs | Test Size | mAP | Weights |
|:------| :----: | :----: | :---: | :---: | :---: |
| [YOLOX-Nano](../../../exps/default/nano.py) | 0.91M | 1.08 | 416x416 | 25.3 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EeWY57o5wQZFtXYd1KJw6Z8B4vxZru649XxQHYIFgio3Qw?e=ZS81ce)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_nano_openvino.tar.gz) |
| [YOLOX-Tiny](../../../exps/default/yolox_tiny.py) | 5.06M | 6.45 | 416x416 |31.7 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/ETfvOoCXdVZNinoSpKA_sEYBIQVqfjjF5_M6VvHRnLVcsA?e=STL1pi)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_tiny_openvino.tar.gz) |
| [YOLOX-S](../../../exps/default/yolox_s.py) | 9.0M | 26.8 | 640x640 |39.6 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EXUjf3PQnbBLrxNrXPueqaIBzVZOrYQOnJpLK1Fytj5ssA?e=GK0LOM)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_s_openvino.tar.gz) |
| [YOLOX-M](../../../exps/default/yolox_m.py) | 25.3M | 73.8 | 640x640 |46.4 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EcoT1BPpeRpLvE_4c441zn8BVNCQ2naxDH3rho7WqdlgLQ?e=95VaM9)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_m_openvino.tar.gz) |
| [YOLOX-L](../../../exps/default/yolox_l.py) | 54.2M | 155.6 | 640x640 |50.0 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EZvmn-YLRuVPh0GAP_w3xHMB2VGvrKqQXyK_Cv5yi_DXUg?e=YRh6Eq)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_l_openvino.tar.gz) |
| [YOLOX-Darknet53](../../../exps/default/yolov3.py) | 63.72M | 185.3 | 640x640 |47.3 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EQP8LSroikFHuwX0jFRetmcBOCDWSFmylHxolV7ezUPXGw?e=bEw5iq)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_darknet53_openvino.tar.gz) |
| [YOLOX-X](../../../exps/default/yolox_x.py) | 99.1M | 281.9 | 640x640 |51.2 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EZFPnLqiD-xIlt7rcZYDjQgB4YXE9wnq1qaSXQwJrsKbdg?e=83nwEz)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_x_openvino.tar.gz) |
## Install OpenVINO Toolkit
Please visit [Openvino Homepage](https://docs.openvinotoolkit.org/latest/get_started_guides.html) for more details.
## Set up the Environment
### For Linux
**Option1. Set up the environment temporarily. You need to run this command every time you start a new shell window.**
```shell
source /opt/intel/openvino_2021/bin/setupvars.sh
```
**Option2. Set up the environment permanently.**
*Step1.* For Linux:
```shell
vim ~/.bashrc
```
*Step2.* Add the following line into your file:
```shell
source /opt/intel/openvino_2021/bin/setupvars.sh
```
*Step3.* Save and exit the file, then run:
```shell
source ~/.bashrc
```
## Convert model
1. Export ONNX model
Please refer to the [ONNX tutorial](../../ONNXRuntime). **Note that you should set --opset to 10, otherwise your next step will fail.**
2. Convert ONNX to OpenVINO
``` shell
cd /openvino_2021/deployment_tools/model_optimizer
```
Install requirements for convert tool
```shell
sudo ./install_prerequisites/install_prerequisites_onnx.sh
```
Then convert model.
```shell
python3 mo.py --input_model <ONNX_MODEL_PATH> --input_shape <INPUT_SHAPE> [--data_type FP16]
```
For example:
```shell
python3 mo.py --input_model yolox.onnx --input_shape [1,3,640,640] --data_type FP16 --output_dir converted_output
```
## Demo
### python
```shell
python openvino_inference.py -m <XML_MODEL_PATH> -i <IMAGE_PATH>
```
or
```shell
python openvino_inference.py -m <XML_MODEL_PATH> -i <IMAGE_PATH> -o <OUTPUT_DIR> -s <SCORE_THR> -d <DEVICE>
```
================================================
FILE: detector/YOLOX/demo/OpenVINO/python/openvino_inference.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (C) 2018-2021 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) Megvii, Inc. and its affiliates.
import argparse
import logging as log
import os
import sys
import cv2
import numpy as np
from openvino.inference_engine import IECore
from yolox.data.data_augment import preproc as preprocess
from yolox.data.datasets import COCO_CLASSES
from yolox.utils import mkdir, multiclass_nms, demo_postprocess, vis
def parse_args() -> argparse.Namespace:
    """Parse and return command line arguments.

    Uses ``add_help=False`` plus an explicit ``-h/--help`` action so that
    all options appear together in a single 'Options' group.

    Returns:
        argparse.Namespace with attributes: model, input, output_dir,
        score_thr, device, labels, number_top.
    """
    parser = argparse.ArgumentParser(add_help=False)
    args = parser.add_argument_group('Options')
    args.add_argument(
        '-h',
        '--help',
        action='help',
        help='Show this help message and exit.')
    args.add_argument(
        '-m',
        '--model',
        required=True,
        type=str,
        help='Required. Path to an .xml or .onnx file with a trained model.')
    args.add_argument(
        '-i',
        '--input',
        required=True,
        type=str,
        help='Required. Path to an image file.')
    args.add_argument(
        '-o',
        '--output_dir',
        type=str,
        default='demo_output',
        help='Path to your output dir.')
    args.add_argument(
        '-s',
        '--score_thr',
        type=float,
        default=0.3,
        # Typo fixed: "threshould" -> "threshold".
        help="Score threshold to visualize the result.")
    args.add_argument(
        '-d',
        '--device',
        default='CPU',
        type=str,
        help='Optional. Specify the target device to infer on; CPU, GPU, \
              MYRIAD, HDDL or HETERO: is acceptable. The sample will look \
              for a suitable plugin for device specified. Default value \
              is CPU.')
    args.add_argument(
        '--labels',
        default=None,
        type=str,
        # Typo fixed: "Option:al." -> "Optional.".
        help='Optional. Path to a labels mapping file.')
    args.add_argument(
        '-nt',
        '--number_top',
        default=10,
        type=int,
        help='Optional. Number of top results.')
    return parser.parse_args()
def main():
    """Run YOLOX inference on one image with OpenVINO and save the result.

    Reads the model and image given on the command line, preprocesses the
    image to the network input size, runs synchronous inference, decodes
    the YOLOX head output, applies multiclass NMS and writes the annotated
    image into ``args.output_dir``.

    Returns:
        -1 when the model does not have exactly one input and one output;
        None otherwise.
    """
    log.basicConfig(format='[ %(levelname)s ] %(message)s', level=log.INFO, stream=sys.stdout)
    args = parse_args()

    # ---------- Step 1. Initialize inference engine core ----------
    log.info('Creating Inference Engine')
    ie = IECore()

    # ---------- Step 2. Read a model in OpenVINO IR (.xml/.bin) or ONNX format ----------
    log.info(f'Reading the network: {args.model}')
    net = ie.read_network(model=args.model)

    if len(net.input_info) != 1:
        log.error('Sample supports only single input topologies')
        return -1
    if len(net.outputs) != 1:
        log.error('Sample supports only single output topologies')
        return -1

    # ---------- Step 3. Configure input & output ----------
    log.info('Configuring input and output blobs')
    # Names of the (single) input and output blobs.
    input_blob = next(iter(net.input_info))
    out_blob = next(iter(net.outputs))

    # Set input and output precision manually.
    net.input_info[input_blob].precision = 'FP32'
    # NOTE(review): output precision FP16 with FP32 input looks deliberate
    # (smaller output transfer) — confirm it matches the converted model.
    net.outputs[out_blob].precision = 'FP16'

    # ---------- Step 4. Load the model to the device ----------
    log.info('Loading the model to the plugin')
    exec_net = ie.load_network(network=net, device_name=args.device)

    # Step 5 (create infer request) is implicit: load_network() already
    # created the default infer request inside the ExecutableNetwork.

    # ---------- Step 6. Prepare input ----------
    origin_img = cv2.imread(args.input)
    # NCHW input layout: pick target height/width from the network itself.
    _, _, h, w = net.input_info[input_blob].input_data.shape
    mean = (0.485, 0.456, 0.406)
    std = (0.229, 0.224, 0.225)
    image, ratio = preprocess(origin_img, (h, w), mean, std)

    # ---------- Step 7. Do inference ----------
    log.info('Starting inference in synchronous mode')
    res = exec_net.infer(inputs={input_blob: image})

    # ---------- Step 8. Process output ----------
    res = res[out_blob]
    predictions = demo_postprocess(res, (h, w), p6=False)[0]

    boxes = predictions[:, :4]
    # Per-class scores = objectness * class confidence.
    scores = predictions[:, 4, None] * predictions[:, 5:]

    # Convert (cx, cy, w, h) to (x0, y0, x1, y1) and undo the letterbox scale.
    boxes_xyxy = np.ones_like(boxes)
    boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2]/2.
    boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3]/2.
    boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2]/2.
    boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3]/2.
    boxes_xyxy /= ratio

    dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.65, score_thr=0.1)

    if dets is not None:
        final_boxes = dets[:, :4]
        final_scores, final_cls_inds = dets[:, 4], dets[:, 5]
        origin_img = vis(origin_img, final_boxes, final_scores, final_cls_inds,
                         conf=args.score_thr, class_names=COCO_CLASSES)

    mkdir(args.output_dir)
    # Bug fix: the parser defines `--input` (args.input); the original code
    # referenced the non-existent `args.image_path` and raised AttributeError.
    output_path = os.path.join(args.output_dir, os.path.basename(args.input))
    cv2.imwrite(output_path, origin_img)
# Script entry point: propagate main()'s return value to the shell
# (None -> exit code 0; -1 on unsupported topologies).
if __name__ == '__main__':
    sys.exit(main())
================================================
FILE: detector/YOLOX/demo/TensorRT/cpp/CMakeLists.txt
================================================
# Build configuration for the YOLOX TensorRT C++ demo.
# Produces a single `yolox` executable linked against CUDA runtime,
# TensorRT (nvinfer) and OpenCV. CUDA/cuDNN/TensorRT paths below are
# machine-specific and must be adapted to the local installation.
cmake_minimum_required(VERSION 2.6)

project(yolox)

add_definitions(-std=c++11)

# NOTE(review): option() expects a description string before the value;
# with only two arguments "OFF" is parsed as the description and the
# option defaults to OFF implicitly — confirm the intent.
option(CUDA_USE_STATIC_CUDA_RUNTIME OFF)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_BUILD_TYPE Debug)

find_package(CUDA REQUIRED)

include_directories(${PROJECT_SOURCE_DIR}/include)
# include and link dirs of cuda and tensorrt, you need adapt them if yours are different
# cuda
include_directories(/data/cuda/cuda-10.2/cuda/include)
link_directories(/data/cuda/cuda-10.2/cuda/lib64)
# cudnn
include_directories(/data/cuda/cuda-10.2/cudnn/v8.0.4/include)
link_directories(/data/cuda/cuda-10.2/cudnn/v8.0.4/lib64)
# tensorrt
include_directories(/data/cuda/cuda-10.2/TensorRT/v7.2.1.6/include)
link_directories(/data/cuda/cuda-10.2/TensorRT/v7.2.1.6/lib)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")

find_package(OpenCV)
include_directories(${OpenCV_INCLUDE_DIRS})

add_executable(yolox ${PROJECT_SOURCE_DIR}/yolox.cpp)
target_link_libraries(yolox nvinfer)
target_link_libraries(yolox cudart)
target_link_libraries(yolox ${OpenCV_LIBS})

add_definitions(-O2 -pthread)
================================================
FILE: detector/YOLOX/demo/TensorRT/cpp/README.md
================================================
# YOLOX-TensorRT in C++
As YOLOX models are easy to convert to TensorRT using the [torch2trt repo](https://github.com/NVIDIA-AI-IOT/torch2trt),
our C++ demo does not include model converting or constructing like other TensorRT demos.
## Step 1: Prepare serialized engine file
Follow the trt [python demo README](../python/README.md) to convert and save the serialized engine file.
Check the 'model_trt.engine' file generated from Step 1, which will be automatically saved in the current demo directory.
## Step 2: build the demo
Please follow the [TensorRT Installation Guide](https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html) to install TensorRT.
Install opencv with ```sudo apt-get install libopencv-dev```.
build the demo:
```shell
mkdir build
cd build
cmake ..
make
```
Then run the demo:
```shell
./yolox ../model_trt.engine -i ../../../../assets/dog.jpg
```
or
```shell
./yolox <path/to/your/engine_file> -i <path/to/image>
```
================================================
FILE: detector/YOLOX/demo/TensorRT/cpp/logging.h
================================================
/*
* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef TENSORRT_LOGGING_H
#define TENSORRT_LOGGING_H
#include "NvInferRuntimeCommon.h"
#include <cassert>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <ostream>
#include <sstream>
#include <string>
using Severity = nvinfer1::ILogger::Severity;
// Stream buffer that accumulates a log message and, when synchronized or
// destroyed, writes it to the wrapped output stream prefixed by a
// "[MM/DD/YYYY-hh:mm:ss]" timestamp (sent to std::cout) and a severity tag.
// Serves as the buffer behind LogStreamConsumer's std::ostream interface.
class LogStreamConsumerBuffer : public std::stringbuf
{
public:
    //! \param stream    destination stream for the formatted message body
    //! \param prefix    severity tag (e.g. "[W] ") prepended to each message
    //! \param shouldLog when false, buffered content is silently discarded
    LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mOutput(stream)
        , mPrefix(prefix)
        , mShouldLog(shouldLog)
    {
    }

    // Fix: the original move constructor only carried over the output-stream
    // reference, leaving mPrefix empty and mShouldLog uninitialized
    // (indeterminate) in the moved-to object. Copy all three members.
    LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other)
        : mOutput(other.mOutput)
        , mPrefix(other.mPrefix)
        , mShouldLog(other.mShouldLog)
    {
    }

    ~LogStreamConsumerBuffer()
    {
        // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence
        // std::streambuf::pptr() gives a pointer to the current position of the output sequence
        // if the pointer to the beginning is not equal to the pointer to the current position,
        // call putOutput() to log the output to the stream
        if (pbase() != pptr())
        {
            putOutput();
        }
    }

    // synchronizes the stream buffer and returns 0 on success
    // synchronizing the stream buffer consists of inserting the buffer contents into the stream,
    // resetting the buffer and flushing the stream
    virtual int sync()
    {
        putOutput();
        return 0;
    }

    void putOutput()
    {
        if (mShouldLog)
        {
            // prepend timestamp (goes to std::cout; the message itself goes to mOutput)
            std::time_t timestamp = std::time(nullptr);
            // Fix: "&timestamp" had been mangled into "×tamp" (HTML-entity mojibake).
            tm* tm_local = std::localtime(&timestamp);
            std::cout << "[";
            std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/";
            std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":";
            std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] ";
            // std::stringbuf::str() gets the string contents of the buffer
            // insert the buffer contents pre-appended by the appropriate prefix into the stream
            mOutput << mPrefix << str();
            // set the buffer to empty
            str("");
            // flush the stream
            mOutput.flush();
        }
    }

    void setShouldLog(bool shouldLog)
    {
        mShouldLog = shouldLog;
    }

private:
    std::ostream& mOutput;   // destination for the message body
    std::string mPrefix;     // severity tag, e.g. "[E] "
    bool mShouldLog;         // gate: drop everything when false
};
//!
//! \class LogStreamConsumerBase
//! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer
//!
// The buffer must be fully constructed before the std::ostream base of
// LogStreamConsumer receives its address, so it lives in this base class;
// base-subobject initialization order makes that guarantee. Left
// byte-identical: the class exists purely for that ordering contract.
class LogStreamConsumerBase
{
public:
    //! \param stream    destination stream forwarded to the buffer
    //! \param prefix    severity tag forwarded to the buffer
    //! \param shouldLog whether the buffer should emit or discard output
    LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog)
        : mBuffer(stream, prefix, shouldLog)
    {
    }

protected:
    // The buffer later wired into LogStreamConsumer's std::ostream base.
    LogStreamConsumerBuffer mBuffer;
};
//!
//! \class LogStreamConsumer
//! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages.
//! Order of base classes is LogStreamConsumerBase and then std::ostream.
//! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field
//! in LogStreamConsumer and then the address of the buffer is passed to std::ostream.
//! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream.
//! Please do not change the order of the parent classes.
//!
// Left byte-identical: the base-class initialization order documented above
// is load-bearing, so only comments are added here.
class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream
{
public:
    //! \brief Creates a LogStreamConsumer which logs messages with level severity.
    //! Reportable severity determines if the messages are severe enough to be logged.
    //!
    //! \param reportableSeverity threshold: messages less severe are suppressed
    //! \param severity           severity of the messages this consumer emits
    LogStreamConsumer(Severity reportableSeverity, Severity severity)
        : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity)
        , std::ostream(&mBuffer) // links the stream buffer with the stream
        , mShouldLog(severity <= reportableSeverity)
        , mSeverity(severity)
    {
    }

    // Move constructor: rebuilds the base with the source's settings so the
    // ostream base can be linked to this object's own buffer.
    LogStreamConsumer(LogStreamConsumer&& other)
        : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog)
        , std::ostream(&mBuffer) // links the stream buffer with the stream
        , mShouldLog(other.mShouldLog)
        , mSeverity(other.mSeverity)
    {
    }

    // Re-evaluate whether this consumer's severity clears the new threshold,
    // and propagate the decision into the underlying buffer.
    void setReportableSeverity(Severity reportableSeverity)
    {
        mShouldLog = mSeverity <= reportableSeverity;
        mBuffer.setShouldLog(mShouldLog);
    }

private:
    // kINFO and below (less severe) go to stdout; warnings/errors to stderr.
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    // Short tag prepended to every message of the given severity.
    static std::string severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    bool mShouldLog;     // cached "severity clears the threshold" decision
    Severity mSeverity;  // severity of messages emitted through this consumer
};
//! \class Logger
//!
//! \brief Class which manages logging of TensorRT tools and samples
//!
//! \details This class provides a common interface for TensorRT tools and samples to log information to the console,
//! and supports logging two types of messages:
//!
//! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal)
//! - Test pass/fail messages
//!
//! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is
//! that the logic for controlling the verbosity and formatting of sample output is centralized in one location.
//!
//! In the future, this class could be extended to support dumping test results to a file in some standard format
//! (for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run).
//!
//! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger
//! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT
//! library and messages coming from the sample.
//!
//! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the
//! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger
//! object.
class Logger : public nvinfer1::ILogger
{
public:
    //! Messages less severe than \p severity are suppressed (default: warnings and up).
    Logger(Severity severity = Severity::kWARNING)
        : mReportableSeverity(severity)
    {
    }

    //!
    //! \enum TestResult
    //! \brief Represents the state of a given test
    //!
    enum class TestResult
    {
        kRUNNING, //!< The test is running
        kPASSED,  //!< The test passed
        kFAILED,  //!< The test failed
        kWAIVED   //!< The test was waived
    };

    //!
    //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger
    //! \return The nvinfer1::ILogger associated with this Logger
    //!
    //! TODO Once all samples are updated to use this method to register the logger with TensorRT,
    //! we can eliminate the inheritance of Logger from ILogger
    //!
    nvinfer1::ILogger& getTRTLogger()
    {
        return *this;
    }

    //!
    //! \brief Implementation of the nvinfer1::ILogger::log() virtual method
    //!
    //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the
    //! inheritance from nvinfer1::ILogger
    //!
    void log(Severity severity, const char* msg) override
    {
        LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl;
    }

    //!
    //! \brief Method for controlling the verbosity of logging output
    //!
    //! \param severity The logger will only emit messages that have severity of this level or higher.
    //!
    void setReportableSeverity(Severity severity)
    {
        mReportableSeverity = severity;
    }

    //!
    //! \brief Opaque handle that holds logging information for a particular test
    //!
    //! This object is an opaque handle to information used by the Logger to print test results.
    //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used
    //! with Logger::reportTest{Start,End}().
    //!
    class TestAtom
    {
    public:
        TestAtom(TestAtom&&) = default;

    private:
        friend class Logger;

        // Only Logger::defineTest() may construct atoms, hence private.
        TestAtom(bool started, const std::string& name, const std::string& cmdline)
            : mStarted(started)
            , mName(name)
            , mCmdline(cmdline)
        {
        }

        bool mStarted;        // set by reportTestStart(); guards double-starts
        std::string mName;    // test name, e.g. "TensorRT.sample_googlenet"
        std::string mCmdline; // command line used to reproduce the test
    };

    //!
    //! \brief Define a test for logging
    //!
    //! \param[in] name The name of the test. This should be a string starting with
    //!                 "TensorRT" and containing dot-separated strings containing
    //!                 the characters [A-Za-z0-9_].
    //!                 For example, "TensorRT.sample_googlenet"
    //! \param[in] cmdline The command line used to reproduce the test
    //
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    //!
    static TestAtom defineTest(const std::string& name, const std::string& cmdline)
    {
        return TestAtom(false, name, cmdline);
    }

    //!
    //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments
    //!        as input
    //!
    //! \param[in] name The name of the test
    //! \param[in] argc The number of command-line arguments
    //! \param[in] argv The array of command-line arguments (given as C strings)
    //!
    //! \return a TestAtom that can be used in Logger::reportTest{Start,End}().
    static TestAtom defineTest(const std::string& name, int argc, char const* const* argv)
    {
        auto cmdline = genCmdlineString(argc, argv);
        return defineTest(name, cmdline);
    }

    //!
    //! \brief Report that a test has started.
    //!
    //! \pre reportTestStart() has not been called yet for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has started
    //!
    static void reportTestStart(TestAtom& testAtom)
    {
        reportTestResult(testAtom, TestResult::kRUNNING);
        assert(!testAtom.mStarted);
        testAtom.mStarted = true;
    }

    //!
    //! \brief Report that a test has ended.
    //!
    //! \pre reportTestStart() has been called for the given testAtom
    //!
    //! \param[in] testAtom The handle to the test that has ended
    //! \param[in] result The result of the test. Should be one of TestResult::kPASSED,
    //!                   TestResult::kFAILED, TestResult::kWAIVED
    //!
    static void reportTestEnd(const TestAtom& testAtom, TestResult result)
    {
        assert(result != TestResult::kRUNNING);
        assert(testAtom.mStarted);
        reportTestResult(testAtom, result);
    }

    //! Report a passing test and return the conventional process exit code.
    static int reportPass(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kPASSED);
        return EXIT_SUCCESS;
    }

    //! Report a failing test and return the conventional process exit code.
    static int reportFail(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kFAILED);
        return EXIT_FAILURE;
    }

    //! Report a waived (skipped) test; waived tests count as success.
    static int reportWaive(const TestAtom& testAtom)
    {
        reportTestEnd(testAtom, TestResult::kWAIVED);
        return EXIT_SUCCESS;
    }

    //! Report pass or fail depending on \p pass and return the matching exit code.
    static int reportTest(const TestAtom& testAtom, bool pass)
    {
        return pass ? reportPass(testAtom) : reportFail(testAtom);
    }

    //! Current verbosity threshold.
    Severity getReportableSeverity() const
    {
        return mReportableSeverity;
    }

private:
    //!
    //! \brief returns an appropriate string for prefixing a log message with the given severity
    //!
    static const char* severityPrefix(Severity severity)
    {
        switch (severity)
        {
        case Severity::kINTERNAL_ERROR: return "[F] ";
        case Severity::kERROR: return "[E] ";
        case Severity::kWARNING: return "[W] ";
        case Severity::kINFO: return "[I] ";
        case Severity::kVERBOSE: return "[V] ";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate string for prefixing a test result message with the given result
    //!
    static const char* testResultString(TestResult result)
    {
        switch (result)
        {
        case TestResult::kRUNNING: return "RUNNING";
        case TestResult::kPASSED: return "PASSED";
        case TestResult::kFAILED: return "FAILED";
        case TestResult::kWAIVED: return "WAIVED";
        default: assert(0); return "";
        }
    }

    //!
    //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity
    //!
    static std::ostream& severityOstream(Severity severity)
    {
        return severity >= Severity::kINFO ? std::cout : std::cerr;
    }

    //!
    //! \brief method that implements logging test results
    //!
    static void reportTestResult(const TestAtom& testAtom, TestResult result)
    {
        severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # "
                                         << testAtom.mCmdline << std::endl;
    }

    //!
    //! \brief generate a command line string from the given (argc, argv) values
    //!
    static std::string genCmdlineString(int argc, char const* const* argv)
    {
        std::stringstream ss;
        for (int i = 0; i < argc; i++)
        {
            if (i > 0)
                ss << " ";
            ss << argv[i];
        }
        return ss.str();
    }

    Severity mReportableSeverity; // verbosity threshold for emitted messages
};
// Free helpers that construct a temporary LogStreamConsumer for one message
// at a fixed severity. Each returns by value and therefore relies on
// LogStreamConsumer's move constructor. Kept in an anonymous namespace so
// every translation unit including this header gets its own copies.
namespace
{
//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE
//!
//! Example usage:
//!
//!     LOG_VERBOSE(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_VERBOSE(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO
//!
//! Example usage:
//!
//!     LOG_INFO(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_INFO(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING
//!
//! Example usage:
//!
//!     LOG_WARN(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_WARN(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR
//!
//! Example usage:
//!
//!     LOG_ERROR(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_ERROR(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR);
}

//!
//! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR
//         ("fatal" severity)
//!
//! Example usage:
//!
//!     LOG_FATAL(logger) << "hello world" << std::endl;
//!
inline LogStreamConsumer LOG_FATAL(const Logger& logger)
{
    return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR);
}
} // anonymous namespace
#endif // TENSORRT_LOGGING_H
================================================
FILE: detector/YOLOX/demo/TensorRT/cpp/yolox.cpp
================================================
#include <fstream>
#include <iostream>
#include <sstream>
#include <numeric>
#include <chrono>
#include <vector>
#include <opencv2/opencv.hpp>
#include <dirent.h>
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include "logging.h"
// Abort the process on any non-zero CUDA runtime status code.
#define CHECK(status) \
    do\
    {\
        auto ret = (status);\
        if (ret != 0)\
        {\
            std::cerr << "Cuda failure: " << ret << std::endl;\
            abort();\
        }\
    } while (0)

#define DEVICE 0  // GPU id
#define NMS_THRESH 0.65       // IoU threshold used by NMS
#define BBOX_CONF_THRESH 0.3  // minimum objectness*class score to keep a box

using namespace nvinfer1;

// stuff we know about the network and the input/output blobs
static const int INPUT_W = 640;  // network input width
static const int INPUT_H = 640;  // network input height
const char* INPUT_BLOB_NAME = "input_0";    // engine input binding name
const char* OUTPUT_BLOB_NAME = "output_0";  // engine output binding name
static Logger gLogger;  // TensorRT logger (defined in logging.h)
// Letterbox-resize `img` to the network input size: scale so the longer
// side fits, keep aspect ratio, and pad the remainder with gray (114).
cv::Mat static_resize(cv::Mat& img) {
    float r = std::min(INPUT_W / (img.cols*1.0), INPUT_H / (img.rows*1.0));
    // r = std::min(r, 1.0f);
    int unpad_w = r * img.cols;
    int unpad_h = r * img.rows;
    cv::Mat re(unpad_h, unpad_w, CV_8UC3);
    cv::resize(img, re, re.size());
    // Fix: cv::Mat's constructor takes (rows, cols) — INPUT_H is rows,
    // INPUT_W is cols. The original swapped them, which was harmless only
    // because the network input is square (640x640).
    cv::Mat out(INPUT_H, INPUT_W, CV_8UC3, cv::Scalar(114, 114, 114));
    re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows)));
    return out;
}
// One detection: box in input-image coordinates, class index and score.
// Template argument restored: "cv::Rect_<float>" was stripped to
// "cv::Rect_" during extraction.
struct Object
{
    cv::Rect_<float> rect;  // (x, y, w, h) bounding box
    int label;              // COCO class index
    float prob;             // objectness * class confidence
};
// One anchor-free grid cell: its (x, y) position on a feature map plus the
// stride of that feature level relative to the network input.
struct GridAndStride
{
    int grid0;   // x index on the feature map
    int grid1;   // y index on the feature map
    int stride;  // downsampling factor of this level (e.g. 8/16/32)
};
static void generate_grids_and_stride(const int target_size, std::vector& strides, std::vector& grid_strides)
{
for (auto stride : strides)
{
int num_grid = target_size / stride;
for (int g1 = 0; g1 < num_grid; g1++)
{
for (int g0 = 0; g0 < num_grid; g0++)
{
grid_strides.push_back((GridAndStride){g0, g1, stride});
}
}
}
}
static inline float intersection_area(const Object& a, const Object& b)
{
cv::Rect_ inter = a.rect & b.rect;
return inter.area();
}
static void qsort_descent_inplace(std::vector& faceobjects, int left, int right)
{
int i = left;
int j = right;
float p = faceobjects[(left + right) / 2].prob;
while (i <= j)
{
while (faceobjects[i].prob > p)
i++;
while (faceobjects[j].prob < p)
j--;
if (i <= j)
{
// swap
std::swap(faceobjects[i], faceobjects[j]);
i++;
j--;
}
}
#pragma omp parallel sections
{
#pragma omp section
{
if (left < j) qsort_descent_inplace(faceobjects, left, j);
}
#pragma omp section
{
if (i < right) qsort_descent_inplace(faceobjects, i, right);
}
}
}
static void qsort_descent_inplace(std::vector& objects)
{
if (objects.empty())
return;
qsort_descent_inplace(objects, 0, objects.size() - 1);
}
static void nms_sorted_bboxes(const std::vector& faceobjects, std::vector& picked, float nms_threshold)
{
picked.clear();
const int n = faceobjects.size();
std::vector areas(n);
for (int i = 0; i < n; i++)
{
areas[i] = faceobjects[i].rect.area();
}
for (int i = 0; i < n; i++)
{
const Object& a = faceobjects[i];
int keep = 1;
for (int j = 0; j < (int)picked.size(); j++)
{
const Object& b = faceobjects[picked[j]];
// intersection over union
float inter_area = intersection_area(a, b);
float union_area = areas[i] + areas[picked[j]] - inter_area;
// float IoU = inter_area / union_area
if (inter_area / union_area > nms_threshold)
keep = 0;
}
if (keep)
picked.push_back(i);
}
}
static void generate_yolox_proposals(std::vector grid_strides, float* feat_blob, float prob_threshold, std::vector& objects)
{
const int num_class = 80;
const int num_anchors = grid_strides.size();
for (int anchor_idx = 0; anchor_idx < num_anchors; anchor_idx++)
{
const int grid0 = grid_strides[anchor_idx].grid0;
const int grid1 = grid_strides[anchor_idx].grid1;
const int stride = grid_strides[anchor_idx].stride;
const int basic_pos = anchor_idx * 85;
// yolox/models/yolo_head.py decode logic
float x_center = (feat_blob[basic_pos+0] + grid0) * stride;
float y_center = (feat_blob[basic_pos+1] + grid1) * stride;
float w = exp(feat_blob[basic_pos+2]) * stride;
float h = exp(feat_blob[basic_pos+3]) * stride;
float x0 = x_center - w * 0.5f;
float y0 = y_center - h * 0.5f;
float box_objectness = feat_blob[basic_pos+4];
for (int class_idx = 0; class_idx < num_class; class_idx++)
{
float box_cls_score = feat_blob[basic_pos + 5 + class_idx];
float box_prob = box_objectness * box_cls_score;
if (box_prob > prob_threshold)
{
Object obj;
obj.rect.x = x0;
obj.rect.y = y0;
obj.rect.width = w;
obj.rect.height = h;
obj.label = class_idx;
obj.prob = box_prob;
objects.push_back(obj);
}
} // class loop
} // point anchor loop
}
// Convert a 640x640 BGR image into a normalized CHW float blob: BGR->RGB,
// scale to [0,1], then apply the ImageNet mean/std per channel.
// NOTE: `img` is modified in place (color conversion) and the caller owns
// the returned buffer (must delete[] it).
float* blobFromImage(cv::Mat& img){
    cv::cvtColor(img, img, cv::COLOR_BGR2RGB);
    float* blob = new float[img.total()*3];
    int channels = 3;
    int img_h = 640;  // assumes the caller already resized to 640x640 — TODO confirm
    int img_w = 640;
    std::vector<float> mean = {0.485, 0.456, 0.406};
    // Renamed from `std`, which shadowed the std:: namespace.
    std::vector<float> std_dev = {0.229, 0.224, 0.225};
    // int loop counters: h/w are passed to cv::Mat::at(int, int).
    for (int c = 0; c < channels; c++)
    {
        for (int h = 0; h < img_h; h++)
        {
            for (int w = 0; w < img_w; w++)
            {
                blob[c * img_w * img_h + h * img_w + w] =
                    (((float)img.at<cv::Vec3b>(h, w)[c]) / 255.0f - mean[c]) / std_dev[c];
            }
        }
    }
    return blob;
}
static void decode_outputs(float* prob, std::vector& objects, float scale, const int img_w, const int img_h) {
std::vector proposals;
std::vector strides = {8, 16, 32};
std::vector grid_strides;
generate_grids_and_stride(INPUT_W, strides, grid_strides);
generate_yolox_proposals(grid_strides, prob, BBOX_CONF_THRESH, proposals);
std::cout << "num of boxes before nms: " << proposals.size() << std::endl;
qsort_descent_inplace(proposals);
std::vector picked;
nms_sorted_bboxes(proposals, picked, NMS_THRESH);
int count = picked.size();
std::cout << "num of boxes: " << count << std::endl;
objects.resize(count);
for (int i = 0; i < count; i++)
{
objects[i] = proposals[picked[i]];
// adjust offset to original unpadded
float x0 = (objects[i].rect.x) / scale;
float y0 = (objects[i].rect.y) / scale;
float x1 = (objects[i].rect.x + objects[i].rect.width) / scale;
float y1 = (objects[i].rect.y + objects[i].rect.height) / scale;
// clip
x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);
objects[i].rect.x = x0;
objects[i].rect.y = y0;
objects[i].rect.width = x1 - x0;
objects[i].rect.height = y1 - y0;
}
}
// Per-class display colors (RGB, each channel in [0,1]); one entry per
// COCO class, indexed by Object::label.  Multiplied by 255 when drawing.
const float color_list[80][3] =
{
    {0.000, 0.447, 0.741},
    {0.850, 0.325, 0.098},
    {0.929, 0.694, 0.125},
    {0.494, 0.184, 0.556},
    {0.466, 0.674, 0.188},
    {0.301, 0.745, 0.933},
    {0.635, 0.078, 0.184},
    {0.300, 0.300, 0.300},
    {0.600, 0.600, 0.600},
    {1.000, 0.000, 0.000},
    {1.000, 0.500, 0.000},
    {0.749, 0.749, 0.000},
    {0.000, 1.000, 0.000},
    {0.000, 0.000, 1.000},
    {0.667, 0.000, 1.000},
    {0.333, 0.333, 0.000},
    {0.333, 0.667, 0.000},
    {0.333, 1.000, 0.000},
    {0.667, 0.333, 0.000},
    {0.667, 0.667, 0.000},
    {0.667, 1.000, 0.000},
    {1.000, 0.333, 0.000},
    {1.000, 0.667, 0.000},
    {1.000, 1.000, 0.000},
    {0.000, 0.333, 0.500},
    {0.000, 0.667, 0.500},
    {0.000, 1.000, 0.500},
    {0.333, 0.000, 0.500},
    {0.333, 0.333, 0.500},
    {0.333, 0.667, 0.500},
    {0.333, 1.000, 0.500},
    {0.667, 0.000, 0.500},
    {0.667, 0.333, 0.500},
    {0.667, 0.667, 0.500},
    {0.667, 1.000, 0.500},
    {1.000, 0.000, 0.500},
    {1.000, 0.333, 0.500},
    {1.000, 0.667, 0.500},
    {1.000, 1.000, 0.500},
    {0.000, 0.333, 1.000},
    {0.000, 0.667, 1.000},
    {0.000, 1.000, 1.000},
    {0.333, 0.000, 1.000},
    {0.333, 0.333, 1.000},
    {0.333, 0.667, 1.000},
    {0.333, 1.000, 1.000},
    {0.667, 0.000, 1.000},
    {0.667, 0.333, 1.000},
    {0.667, 0.667, 1.000},
    {0.667, 1.000, 1.000},
    {1.000, 0.000, 1.000},
    {1.000, 0.333, 1.000},
    {1.000, 0.667, 1.000},
    {0.333, 0.000, 0.000},
    {0.500, 0.000, 0.000},
    {0.667, 0.000, 0.000},
    {0.833, 0.000, 0.000},
    {1.000, 0.000, 0.000},
    {0.000, 0.167, 0.000},
    {0.000, 0.333, 0.000},
    {0.000, 0.500, 0.000},
    {0.000, 0.667, 0.000},
    {0.000, 0.833, 0.000},
    {0.000, 1.000, 0.000},
    {0.000, 0.000, 0.167},
    {0.000, 0.000, 0.333},
    {0.000, 0.000, 0.500},
    {0.000, 0.000, 0.667},
    {0.000, 0.000, 0.833},
    {0.000, 0.000, 1.000},
    {0.000, 0.000, 0.000},
    {0.143, 0.143, 0.143},
    {0.286, 0.286, 0.286},
    {0.429, 0.429, 0.429},
    {0.571, 0.571, 0.571},
    {0.714, 0.714, 0.714},
    {0.857, 0.857, 0.857},
    {0.000, 0.447, 0.741},
    {0.314, 0.717, 0.741},
    {0.50, 0.5, 0}
};
// Render detections on a copy of `bgr` (box, class name, confidence) and
// write the result to det_res.jpg.  Parameter `f` is kept for interface
// compatibility but is unused in this build.
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects, std::string f)
{
    static const char* class_names[] = {
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush"
    };
    cv::Mat image = bgr.clone();
    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];
        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
        cv::Scalar color = cv::Scalar(color_list[obj.label][0], color_list[obj.label][1], color_list[obj.label][2]);
        // Pick black or white text depending on the box color's brightness.
        float c_mean = cv::mean(color)[0];
        cv::Scalar txt_color;
        if (c_mean > 0.5){
            txt_color = cv::Scalar(0, 0, 0);
        }else{
            txt_color = cv::Scalar(255, 255, 255);
        }
        cv::rectangle(image, obj.rect, color * 255, 2);
        char text[256];
        // snprintf instead of sprintf: bounded write into the fixed buffer.
        snprintf(text, sizeof(text), "%s %.1f%%", class_names[obj.label], obj.prob * 100);
        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.4, 1, &baseLine);
        cv::Scalar txt_bk_color = color * 0.7 * 255;
        int x = obj.rect.x;
        int y = obj.rect.y + 1;
        //int y = obj.rect.y - label_size.height - baseLine;
        if (y > image.rows)
            y = image.rows;
        //if (x + label_size.width > image.cols)
        //x = image.cols - label_size.width;
        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      txt_bk_color, -1);
        cv::putText(image, text, cv::Point(x, y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.4, txt_color, 1);
    }
    cv::imwrite("det_res.jpg", image);
    fprintf(stderr, "save vis file\n");
    /* cv::imshow("image", image); */
    /* cv::waitKey(0); */
}
// Run one synchronous inference pass with TensorRT: allocate device buffers,
// copy `input` host->device, enqueue the engine (batch size 1), copy
// `output_size` floats back to `output`, then release the stream/buffers.
void doInference(IExecutionContext& context, float* input, float* output, const int output_size, cv::Size input_shape) {
    const ICudaEngine& engine = context.getEngine();
    // Pointers to input and output device buffers to pass to engine.
    // Engine requires exactly IEngine::getNbBindings() number of buffers.
    assert(engine.getNbBindings() == 2);
    void* buffers[2];
    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME);
    assert(engine.getBindingDataType(inputIndex) == nvinfer1::DataType::kFLOAT);
    const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME);
    assert(engine.getBindingDataType(outputIndex) == nvinfer1::DataType::kFLOAT);
    // NOTE(review): mBatchSize is queried but never used below.
    int mBatchSize = engine.getMaxBatchSize();
    // Create GPU buffers on device (3-channel float input, flat float output)
    CHECK(cudaMalloc(&buffers[inputIndex], 3 * input_shape.height * input_shape.width * sizeof(float)));
    CHECK(cudaMalloc(&buffers[outputIndex], output_size*sizeof(float)));
    // Create stream
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[inputIndex], input, 3 * input_shape.height * input_shape.width * sizeof(float), cudaMemcpyHostToDevice, stream));
    context.enqueue(1, buffers, stream, nullptr);
    CHECK(cudaMemcpyAsync(output, buffers[outputIndex], output_size * sizeof(float), cudaMemcpyDeviceToHost, stream));
    // Block until the async copy of the results has completed.
    cudaStreamSynchronize(stream);
    // Release stream and buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex]));
}
int main(int argc, char** argv) {
cudaSetDevice(DEVICE);
// create a model using the API directly and serialize it to a stream
char *trtModelStream{nullptr};
size_t size{0};
if (argc == 4 && std::string(argv[2]) == "-i") {
const std::string engine_file_path {argv[1]};
std::ifstream file(engine_file_path, std::ios::binary);
if (file.good()) {
file.seekg(0, file.end);
size = file.tellg();
file.seekg(0, file.beg);
trtModelStream = new char[size];
assert(trtModelStream);
file.read(trtModelStream, size);
file.close();
}
} else {
std::cerr << "arguments not right!" << std::endl;
std::cerr << "run 'python3 yolox/deploy/trt.py -n yolox-{tiny, s, m, l, x}' to serialize model first!" << std::endl;
std::cerr << "Then use the following command:" << std::endl;
std::cerr << "./yolox ../model_trt.engine -i ../../../assets/dog.jpg // deserialize file and run inference" << std::endl;
return -1;
}
const std::string input_image_path {argv[3]};
//std::vector file_names;
//if (read_files_in_dir(argv[2], file_names) < 0) {
//std::cout << "read_files_in_dir failed." << std::endl;
//return -1;
//}
IRuntime* runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);
ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
assert(engine != nullptr);
IExecutionContext* context = engine->createExecutionContext();
assert(context != nullptr);
delete[] trtModelStream;
auto out_dims = engine->getBindingDimensions(1);
auto output_size = 1;
for(int j=0;j(end - start).count() << "ms" << std::endl;
std::vector objects;
decode_outputs(prob, objects, scale, img_w, img_h);
draw_objects(img, objects, input_image_path);
// destroy the engine
context->destroy();
engine->destroy();
runtime->destroy();
return 0;
}
================================================
FILE: detector/YOLOX/demo/TensorRT/python/README.md
================================================
# YOLOX-TensorRT in Python
This tutorial includes a Python demo for TensorRT.
## Install TensorRT Toolkit
Please follow the [TensorRT Installation Guide](https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html) and [torch2trt gitrepo](https://github.com/NVIDIA-AI-IOT/torch2trt) to install TensorRT and torch2trt.
## Convert model
YOLOX models can be easily converted to TensorRT models using torch2trt.
If you want to convert our model, use the flag -n to specify a model name:
```shell
python tools/trt.py -n <YOLOX_MODEL_NAME> -c <YOLOX_CHECKPOINT>
```
For example:
```shell
python tools/trt.py -n yolox-s -c your_ckpt.pth.tar
```
`<YOLOX_MODEL_NAME>` can be: yolox-nano, yolox-tiny, yolox-s, yolox-m, yolox-l, yolox-x.
If you want to convert your customized model, use the flag -f to specify your exp file:
```shell
python tools/trt.py -f <YOLOX_EXP_FILE> -c <YOLOX_CHECKPOINT>
```
For example:
```shell
python tools/trt.py -f /path/to/your/yolox/exps/yolox_s.py -c your_ckpt.pth.tar
```
*yolox_s.py* can be any exp file modified by you.
The converted model and the serialized engine file (for C++ demo) will be saved on your experiment output dir.
## Demo
The TensorRT python demo is merged on our pytorch demo file, so you can run the pytorch demo command with ```--trt```.
```shell
python tools/demo.py image -n yolox-s --trt --save_result
```
or
```shell
python tools/demo.py image -f exps/default/yolox_s.py --trt --save_result
```
================================================
FILE: detector/YOLOX/demo/ncnn/android/README.md
================================================
# YOLOX-Android-ncnn
Android app for YOLOX object detection based on [ncnn](https://github.com/Tencent/ncnn)
## Tutorial
### Step1
Download ncnn-android-vulkan.zip from [releases of ncnn](https://github.com/Tencent/ncnn/releases). This repo uses the
[20210525 release](https://github.com/Tencent/ncnn/releases/download/20210525/ncnn-20210525-android-vulkan.zip) for building.
### Step2
After downloading, please extract your zip file. Then, there are two ways to finish this step:
* put your extracted directory into app/src/main/jni
* change the ncnn_DIR path in app/src/main/jni/CMakeLists.txt to your extracted directory.
### Step3
Download example param and bin file from [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/ESXBH_GSSmFMszWJ6YG2VkQB5cWDfqVWXgk0D996jH0rpQ?e=qzEqUh) or [github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_s_ncnn.tar.gz). Unzip the file to app/src/main/assets.
### Step4
Open this project with Android Studio, build it and enjoy!
## Reference
* [ncnn-android-yolov5](https://github.com/nihui/ncnn-android-yolov5)
================================================
FILE: detector/YOLOX/demo/ncnn/android/app/build.gradle
================================================
apply plugin: 'com.android.application'

android {
    compileSdkVersion 24
    buildToolsVersion "29.0.2"

    defaultConfig {
        applicationId "com.megvii.yoloXncnn"
        archivesBaseName = "$applicationId"

        // Native ncnn library is built only for the two common ARM ABIs.
        ndk {
            moduleName "ncnn"
            abiFilters "armeabi-v7a", "arm64-v8a"
        }
        minSdkVersion 24
    }

    // Builds the JNI wrapper (yoloXncnn_jni.cpp) via CMake.
    externalNativeBuild {
        cmake {
            version "3.10.2"
            path file('src/main/jni/CMakeLists.txt')
        }
    }
}
================================================
FILE: detector/YOLOX/demo/ncnn/android/app/src/main/AndroidManifest.xml
================================================
================================================
FILE: detector/YOLOX/demo/ncnn/android/app/src/main/assets/yolox.param
================================================
7767517
220 250
Input images 0 1 images
YoloV5Focus focus 1 1 images 503
Convolution Conv_41 1 1 503 877 0=32 1=3 4=1 5=1 6=3456
Swish Mul_43 1 1 877 507
Convolution Conv_44 1 1 507 880 0=64 1=3 3=2 4=1 5=1 6=18432
Swish Mul_46 1 1 880 511
Split splitncnn_0 1 2 511 511_splitncnn_0 511_splitncnn_1
Convolution Conv_47 1 1 511_splitncnn_1 883 0=32 1=1 5=1 6=2048
Swish Mul_49 1 1 883 515
Split splitncnn_1 1 2 515 515_splitncnn_0 515_splitncnn_1
Convolution Conv_50 1 1 511_splitncnn_0 886 0=32 1=1 5=1 6=2048
Swish Mul_52 1 1 886 519
Convolution Conv_53 1 1 515_splitncnn_1 889 0=32 1=1 5=1 6=1024
Swish Mul_55 1 1 889 523
Convolution Conv_56 1 1 523 892 0=32 1=3 4=1 5=1 6=9216
Swish Mul_58 1 1 892 527
BinaryOp Add_59 2 1 527 515_splitncnn_0 528
Concat Concat_60 2 1 528 519 529
Convolution Conv_61 1 1 529 895 0=64 1=1 5=1 6=4096
Swish Mul_63 1 1 895 533
Convolution Conv_64 1 1 533 898 0=128 1=3 3=2 4=1 5=1 6=73728
Swish Mul_66 1 1 898 537
Split splitncnn_2 1 2 537 537_splitncnn_0 537_splitncnn_1
Convolution Conv_67 1 1 537_splitncnn_1 901 0=64 1=1 5=1 6=8192
Swish Mul_69 1 1 901 541
Split splitncnn_3 1 2 541 541_splitncnn_0 541_splitncnn_1
Convolution Conv_70 1 1 537_splitncnn_0 904 0=64 1=1 5=1 6=8192
Swish Mul_72 1 1 904 545
Convolution Conv_73 1 1 541_splitncnn_1 907 0=64 1=1 5=1 6=4096
Swish Mul_75 1 1 907 549
Convolution Conv_76 1 1 549 910 0=64 1=3 4=1 5=1 6=36864
Swish Mul_78 1 1 910 553
BinaryOp Add_79 2 1 553 541_splitncnn_0 554
Split splitncnn_4 1 2 554 554_splitncnn_0 554_splitncnn_1
Convolution Conv_80 1 1 554_splitncnn_1 913 0=64 1=1 5=1 6=4096
Swish Mul_82 1 1 913 558
Convolution Conv_83 1 1 558 916 0=64 1=3 4=1 5=1 6=36864
Swish Mul_85 1 1 916 562
BinaryOp Add_86 2 1 562 554_splitncnn_0 563
Split splitncnn_5 1 2 563 563_splitncnn_0 563_splitncnn_1
Convolution Conv_87 1 1 563_splitncnn_1 919 0=64 1=1 5=1 6=4096
Swish Mul_89 1 1 919 567
Convolution Conv_90 1 1 567 922 0=64 1=3 4=1 5=1 6=36864
Swish Mul_92 1 1 922 571
BinaryOp Add_93 2 1 571 563_splitncnn_0 572
Concat Concat_94 2 1 572 545 573
Convolution Conv_95 1 1 573 925 0=128 1=1 5=1 6=16384
Swish Mul_97 1 1 925 577
Split splitncnn_6 1 2 577 577_splitncnn_0 577_splitncnn_1
Convolution Conv_98 1 1 577_splitncnn_1 928 0=256 1=3 3=2 4=1 5=1 6=294912
Swish Mul_100 1 1 928 581
Split splitncnn_7 1 2 581 581_splitncnn_0 581_splitncnn_1
Convolution Conv_101 1 1 581_splitncnn_1 931 0=128 1=1 5=1 6=32768
Swish Mul_103 1 1 931 585
Split splitncnn_8 1 2 585 585_splitncnn_0 585_splitncnn_1
Convolution Conv_104 1 1 581_splitncnn_0 934 0=128 1=1 5=1 6=32768
Swish Mul_106 1 1 934 589
Convolution Conv_107 1 1 585_splitncnn_1 937 0=128 1=1 5=1 6=16384
Swish Mul_109 1 1 937 593
Convolution Conv_110 1 1 593 940 0=128 1=3 4=1 5=1 6=147456
Swish Mul_112 1 1 940 597
BinaryOp Add_113 2 1 597 585_splitncnn_0 598
Split splitncnn_9 1 2 598 598_splitncnn_0 598_splitncnn_1
Convolution Conv_114 1 1 598_splitncnn_1 943 0=128 1=1 5=1 6=16384
Swish Mul_116 1 1 943 602
Convolution Conv_117 1 1 602 946 0=128 1=3 4=1 5=1 6=147456
Swish Mul_119 1 1 946 606
BinaryOp Add_120 2 1 606 598_splitncnn_0 607
Split splitncnn_10 1 2 607 607_splitncnn_0 607_splitncnn_1
Convolution Conv_121 1 1 607_splitncnn_1 949 0=128 1=1 5=1 6=16384
Swish Mul_123 1 1 949 611
Convolution Conv_124 1 1 611 952 0=128 1=3 4=1 5=1 6=147456
Swish Mul_126 1 1 952 615
BinaryOp Add_127 2 1 615 607_splitncnn_0 616
Concat Concat_128 2 1 616 589 617
Convolution Conv_129 1 1 617 955 0=256 1=1 5=1 6=65536
Swish Mul_131 1 1 955 621
Split splitncnn_11 1 2 621 621_splitncnn_0 621_splitncnn_1
Convolution Conv_132 1 1 621_splitncnn_1 958 0=512 1=3 3=2 4=1 5=1 6=1179648
Swish Mul_134 1 1 958 625
Convolution Conv_135 1 1 625 961 0=256 1=1 5=1 6=131072
Swish Mul_137 1 1 961 629
Split splitncnn_12 1 4 629 629_splitncnn_0 629_splitncnn_1 629_splitncnn_2 629_splitncnn_3
Pooling MaxPool_138 1 1 629_splitncnn_3 630 1=5 3=2 5=1
Pooling MaxPool_139 1 1 629_splitncnn_2 631 1=9 3=4 5=1
Pooling MaxPool_140 1 1 629_splitncnn_1 632 1=13 3=6 5=1
Concat Concat_141 4 1 629_splitncnn_0 630 631 632 633
Convolution Conv_142 1 1 633 964 0=512 1=1 5=1 6=524288
Swish Mul_144 1 1 964 637
Split splitncnn_13 1 2 637 637_splitncnn_0 637_splitncnn_1
Convolution Conv_145 1 1 637_splitncnn_1 967 0=256 1=1 5=1 6=131072
Swish Mul_147 1 1 967 641
Convolution Conv_148 1 1 637_splitncnn_0 970 0=256 1=1 5=1 6=131072
Swish Mul_150 1 1 970 645
Convolution Conv_151 1 1 641 973 0=256 1=1 5=1 6=65536
Swish Mul_153 1 1 973 649
Convolution Conv_154 1 1 649 976 0=256 1=3 4=1 5=1 6=589824
Swish Mul_156 1 1 976 653
Concat Concat_157 2 1 653 645 654
Convolution Conv_158 1 1 654 979 0=512 1=1 5=1 6=262144
Swish Mul_160 1 1 979 658
Convolution Conv_161 1 1 658 982 0=256 1=1 5=1 6=131072
Swish Mul_163 1 1 982 662
Split splitncnn_14 1 2 662 662_splitncnn_0 662_splitncnn_1
Interp Resize_165 1 1 662_splitncnn_1 667 0=1 1=2.000000e+00 2=2.000000e+00
Concat Concat_166 2 1 667 621_splitncnn_0 668
Split splitncnn_15 1 2 668 668_splitncnn_0 668_splitncnn_1
Convolution Conv_167 1 1 668_splitncnn_1 985 0=128 1=1 5=1 6=65536
Swish Mul_169 1 1 985 672
Convolution Conv_170 1 1 668_splitncnn_0 988 0=128 1=1 5=1 6=65536
Swish Mul_172 1 1 988 676
Convolution Conv_173 1 1 672 991 0=128 1=1 5=1 6=16384
Swish Mul_175 1 1 991 680
Convolution Conv_176 1 1 680 994 0=128 1=3 4=1 5=1 6=147456
Swish Mul_178 1 1 994 684
Concat Concat_179 2 1 684 676 685
Convolution Conv_180 1 1 685 997 0=256 1=1 5=1 6=65536
Swish Mul_182 1 1 997 689
Convolution Conv_183 1 1 689 1000 0=128 1=1 5=1 6=32768
Swish Mul_185 1 1 1000 693
Split splitncnn_16 1 2 693 693_splitncnn_0 693_splitncnn_1
Interp Resize_187 1 1 693_splitncnn_1 698 0=1 1=2.000000e+00 2=2.000000e+00
Concat Concat_188 2 1 698 577_splitncnn_0 699
Split splitncnn_17 1 2 699 699_splitncnn_0 699_splitncnn_1
Convolution Conv_189 1 1 699_splitncnn_1 1003 0=64 1=1 5=1 6=16384
Swish Mul_191 1 1 1003 703
Convolution Conv_192 1 1 699_splitncnn_0 1006 0=64 1=1 5=1 6=16384
Swish Mul_194 1 1 1006 707
Convolution Conv_195 1 1 703 1009 0=64 1=1 5=1 6=4096
Swish Mul_197 1 1 1009 711
Convolution Conv_198 1 1 711 1012 0=64 1=3 4=1 5=1 6=36864
Swish Mul_200 1 1 1012 715
Concat Concat_201 2 1 715 707 716
Convolution Conv_202 1 1 716 1015 0=128 1=1 5=1 6=16384
Swish Mul_204 1 1 1015 720
Split splitncnn_18 1 2 720 720_splitncnn_0 720_splitncnn_1
Convolution Conv_205 1 1 720_splitncnn_1 1018 0=128 1=3 3=2 4=1 5=1 6=147456
Swish Mul_207 1 1 1018 724
Concat Concat_208 2 1 724 693_splitncnn_0 725
Split splitncnn_19 1 2 725 725_splitncnn_0 725_splitncnn_1
Convolution Conv_209 1 1 725_splitncnn_1 1021 0=128 1=1 5=1 6=32768
Swish Mul_211 1 1 1021 729
Convolution Conv_212 1 1 725_splitncnn_0 1024 0=128 1=1 5=1 6=32768
Swish Mul_214 1 1 1024 733
Convolution Conv_215 1 1 729 1027 0=128 1=1 5=1 6=16384
Swish Mul_217 1 1 1027 737
Convolution Conv_218 1 1 737 1030 0=128 1=3 4=1 5=1 6=147456
Swish Mul_220 1 1 1030 741
Concat Concat_221 2 1 741 733 742
Convolution Conv_222 1 1 742 1033 0=256 1=1 5=1 6=65536
Swish Mul_224 1 1 1033 746
Split splitncnn_20 1 2 746 746_splitncnn_0 746_splitncnn_1
Convolution Conv_225 1 1 746_splitncnn_1 1036 0=256 1=3 3=2 4=1 5=1 6=589824
Swish Mul_227 1 1 1036 750
Concat Concat_228 2 1 750 662_splitncnn_0 751
Split splitncnn_21 1 2 751 751_splitncnn_0 751_splitncnn_1
Convolution Conv_229 1 1 751_splitncnn_1 1039 0=256 1=1 5=1 6=131072
Swish Mul_231 1 1 1039 755
Convolution Conv_232 1 1 751_splitncnn_0 1042 0=256 1=1 5=1 6=131072
Swish Mul_234 1 1 1042 759
Convolution Conv_235 1 1 755 1045 0=256 1=1 5=1 6=65536
Swish Mul_237 1 1 1045 763
Convolution Conv_238 1 1 763 1048 0=256 1=3 4=1 5=1 6=589824
Swish Mul_240 1 1 1048 767
Concat Concat_241 2 1 767 759 768
Convolution Conv_242 1 1 768 1051 0=512 1=1 5=1 6=262144
Swish Mul_244 1 1 1051 772
Convolution Conv_245 1 1 720_splitncnn_0 1054 0=128 1=1 5=1 6=16384
Swish Mul_247 1 1 1054 776
Split splitncnn_22 1 2 776 776_splitncnn_0 776_splitncnn_1
Convolution Conv_248 1 1 776_splitncnn_1 1057 0=128 1=3 4=1 5=1 6=147456
Swish Mul_250 1 1 1057 780
Convolution Conv_251 1 1 780 1060 0=128 1=3 4=1 5=1 6=147456
Swish Mul_253 1 1 1060 784
Convolution Conv_254 1 1 784 797 0=80 1=1 5=1 6=10240 9=4
Convolution Conv_255 1 1 776_splitncnn_0 1063 0=128 1=3 4=1 5=1 6=147456
Swish Mul_257 1 1 1063 789
Convolution Conv_258 1 1 789 1066 0=128 1=3 4=1 5=1 6=147456
Swish Mul_260 1 1 1066 793
Split splitncnn_23 1 2 793 793_splitncnn_0 793_splitncnn_1
Convolution Conv_261 1 1 793_splitncnn_1 794 0=4 1=1 5=1 6=512
Convolution Conv_262 1 1 793_splitncnn_0 796 0=1 1=1 5=1 6=128 9=4
Concat Concat_265 3 1 794 796 797 798
Convolution Conv_266 1 1 746_splitncnn_0 1069 0=128 1=1 5=1 6=32768
Swish Mul_268 1 1 1069 802
Split splitncnn_24 1 2 802 802_splitncnn_0 802_splitncnn_1
Convolution Conv_269 1 1 802_splitncnn_1 1072 0=128 1=3 4=1 5=1 6=147456
Swish Mul_271 1 1 1072 806
Convolution Conv_272 1 1 806 1075 0=128 1=3 4=1 5=1 6=147456
Swish Mul_274 1 1 1075 810
Convolution Conv_275 1 1 810 823 0=80 1=1 5=1 6=10240 9=4
Convolution Conv_276 1 1 802_splitncnn_0 1078 0=128 1=3 4=1 5=1 6=147456
Swish Mul_278 1 1 1078 815
Convolution Conv_279 1 1 815 1081 0=128 1=3 4=1 5=1 6=147456
Swish Mul_281 1 1 1081 819
Split splitncnn_25 1 2 819 819_splitncnn_0 819_splitncnn_1
Convolution Conv_282 1 1 819_splitncnn_1 820 0=4 1=1 5=1 6=512
Convolution Conv_283 1 1 819_splitncnn_0 822 0=1 1=1 5=1 6=128 9=4
Concat Concat_286 3 1 820 822 823 824
Convolution Conv_287 1 1 772 1084 0=128 1=1 5=1 6=65536
Swish Mul_289 1 1 1084 828
Split splitncnn_26 1 2 828 828_splitncnn_0 828_splitncnn_1
Convolution Conv_290 1 1 828_splitncnn_1 1087 0=128 1=3 4=1 5=1 6=147456
Swish Mul_292 1 1 1087 832
Convolution Conv_293 1 1 832 1090 0=128 1=3 4=1 5=1 6=147456
Swish Mul_295 1 1 1090 836
Convolution Conv_296 1 1 836 849 0=80 1=1 5=1 6=10240 9=4
Convolution Conv_297 1 1 828_splitncnn_0 1093 0=128 1=3 4=1 5=1 6=147456
Swish Mul_299 1 1 1093 841
Convolution Conv_300 1 1 841 1096 0=128 1=3 4=1 5=1 6=147456
Swish Mul_302 1 1 1096 845
Split splitncnn_27 1 2 845 845_splitncnn_0 845_splitncnn_1
Convolution Conv_303 1 1 845_splitncnn_1 846 0=4 1=1 5=1 6=512
Convolution Conv_304 1 1 845_splitncnn_0 848 0=1 1=1 5=1 6=128 9=4
Concat Concat_307 3 1 846 848 849 850
Reshape Reshape_315 1 1 798 858 0=-1 1=85
Reshape Reshape_323 1 1 824 866 0=-1 1=85
Reshape Reshape_331 1 1 850 874 0=-1 1=85
Concat Concat_332 3 1 858 866 874 875 0=1
Permute Transpose_333 1 1 875 output 0=1
================================================
FILE: detector/YOLOX/demo/ncnn/android/app/src/main/java/com/megvii/yoloXncnn/MainActivity.java
================================================
// Some code in this file is based on:
// https://github.com/nihui/ncnn-android-yolov5/blob/master/app/src/main/java/com/tencent/yolov5ncnn/MainActivity.java
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
// Copyright (C) Megvii, Inc. and its affiliates. All rights reserved.
package com.megvii.yoloXncnn;
import android.app.Activity;
import android.content.Intent;
import android.graphics.Bitmap;
import android.graphics.BitmapFactory;
import android.graphics.Canvas;
import android.graphics.Color;
import android.graphics.Paint;
import android.media.ExifInterface;
import android.graphics.Matrix;
import android.net.Uri;
import android.os.Bundle;
import android.util.Log;
import android.view.View;
import android.widget.Button;
import android.widget.ImageView;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.io.IOException;
/**
 * Demo activity: pick an image from the gallery, run YOLOX detection via the
 * native ncnn wrapper (CPU or GPU button), and draw the results on screen.
 */
public class MainActivity extends Activity
{
    // Request code for the image-picker intent.
    private static final int SELECT_IMAGE = 1;

    private ImageView imageView;
    private Bitmap bitmap = null;             // image as decoded from the gallery
    private Bitmap yourSelectedImage = null;  // ARGB_8888 copy fed to the detector
    private YOLOXncnn yoloX = new YOLOXncnn();

    /** Called when the activity is first created. */
    @Override
    public void onCreate(Bundle savedInstanceState)
    {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.main);

        // Load the ncnn model from app assets once at startup.
        boolean ret_init = yoloX.Init(getAssets());
        if (!ret_init)
        {
            Log.e("MainActivity", "yoloXncnn Init failed");
        }

        imageView = (ImageView) findViewById(R.id.imageView);

        // Opens the system gallery picker.
        Button buttonImage = (Button) findViewById(R.id.buttonImage);
        buttonImage.setOnClickListener(new View.OnClickListener() {
            @Override
            public void onClick(View arg0) {
                Intent i = new Intent(Intent.ACTION_PICK);
                i.setType("image/*");
                startActivityForResult(i, SELECT_IMAGE);
            }
        });

        // Runs detection on the CPU (use_gpu = false).
        Button buttonDetect = (Button) findViewById(R.id.buttonDetect);
        buttonDetect.setOnClickListener(new View.OnClickListener() {
            @Override
            public void onClick(View arg0) {
                if (yourSelectedImage == null)
                    return;
                YOLOXncnn.Obj[] objects = yoloX.Detect(yourSelectedImage, false);
                showObjects(objects);
            }
        });

        // Runs detection with Vulkan GPU acceleration (use_gpu = true).
        Button buttonDetectGPU = (Button) findViewById(R.id.buttonDetectGPU);
        buttonDetectGPU.setOnClickListener(new View.OnClickListener() {
            @Override
            public void onClick(View arg0) {
                if (yourSelectedImage == null)
                    return;
                YOLOXncnn.Obj[] objects = yoloX.Detect(yourSelectedImage, true);
                showObjects(objects);
            }
        });
    }

    /**
     * Draw detection boxes and labels onto a copy of the selected bitmap and
     * display it; with null objects the plain image is shown instead.
     */
    private void showObjects(YOLOXncnn.Obj[] objects)
    {
        if (objects == null)
        {
            imageView.setImageBitmap(bitmap);
            return;
        }

        // draw objects on bitmap
        Bitmap rgba = bitmap.copy(Bitmap.Config.ARGB_8888, true);

        // 19-color palette; box color cycles by detection index (i % 19).
        final int[] colors = new int[] {
            Color.rgb( 54,  67, 244),
            Color.rgb( 99,  30, 233),
            Color.rgb(176,  39, 156),
            Color.rgb(183,  58, 103),
            Color.rgb(181,  81,  63),
            Color.rgb(243, 150,  33),
            Color.rgb(244, 169,   3),
            Color.rgb(212, 188,   0),
            Color.rgb(136, 150,   0),
            Color.rgb( 80, 175,  76),
            Color.rgb( 74, 195, 139),
            Color.rgb( 57, 220, 205),
            Color.rgb( 59, 235, 255),
            Color.rgb(  7, 193, 255),
            Color.rgb(  0, 152, 255),
            Color.rgb( 34,  87, 255),
            Color.rgb( 72,  85, 121),
            Color.rgb(158, 158, 158),
            Color.rgb(139, 125,  96)
        };

        Canvas canvas = new Canvas(rgba);

        Paint paint = new Paint();
        paint.setStyle(Paint.Style.STROKE);
        paint.setStrokeWidth(4);

        Paint textbgpaint = new Paint();
        textbgpaint.setColor(Color.WHITE);
        textbgpaint.setStyle(Paint.Style.FILL);

        Paint textpaint = new Paint();
        textpaint.setColor(Color.BLACK);
        textpaint.setTextSize(26);
        textpaint.setTextAlign(Paint.Align.LEFT);

        for (int i = 0; i < objects.length; i++)
        {
            paint.setColor(colors[i % 19]);

            canvas.drawRect(objects[i].x, objects[i].y, objects[i].x + objects[i].w, objects[i].y + objects[i].h, paint);

            // draw filled text inside image
            {
                String text = objects[i].label + " = " + String.format("%.1f", objects[i].prob * 100) + "%";

                float text_width = textpaint.measureText(text);
                float text_height = - textpaint.ascent() + textpaint.descent();

                // Place the label above the box, clamped inside the bitmap.
                float x = objects[i].x;
                float y = objects[i].y - text_height;
                if (y < 0)
                    y = 0;
                if (x + text_width > rgba.getWidth())
                    x = rgba.getWidth() - text_width;

                canvas.drawRect(x, y, x + text_width, y + text_height, textbgpaint);

                canvas.drawText(text, x, y - textpaint.ascent(), textpaint);
            }
        }

        imageView.setImageBitmap(rgba);
    }

    /** Receives the picked image, decodes it, and keeps an ARGB_8888 copy. */
    @Override
    protected void onActivityResult(int requestCode, int resultCode, Intent data)
    {
        super.onActivityResult(requestCode, resultCode, data);

        if (resultCode == RESULT_OK && null != data) {
            Uri selectedImage = data.getData();

            try
            {
                if (requestCode == SELECT_IMAGE) {
                    bitmap = decodeUri(selectedImage);
                    yourSelectedImage = bitmap.copy(Bitmap.Config.ARGB_8888, true);
                    imageView.setImageBitmap(bitmap);
                }
            }
            catch (FileNotFoundException e)
            {
                Log.e("MainActivity", "FileNotFoundException");
                return;
            }
        }
    }

    /**
     * Decode a content URI to a Bitmap, downsampling by powers of two until
     * it is near REQUIRED_SIZE, then rotate according to its EXIF orientation.
     */
    private Bitmap decodeUri(Uri selectedImage) throws FileNotFoundException
    {
        // Decode image size
        BitmapFactory.Options o = new BitmapFactory.Options();
        o.inJustDecodeBounds = true;
        BitmapFactory.decodeStream(getContentResolver().openInputStream(selectedImage), null, o);

        // The new size we want to scale to
        final int REQUIRED_SIZE = 640;

        // Find the correct scale value. It should be the power of 2.
        int width_tmp = o.outWidth, height_tmp = o.outHeight;
        int scale = 1;
        while (true) {
            if (width_tmp / 2 < REQUIRED_SIZE || height_tmp / 2 < REQUIRED_SIZE) {
                break;
            }
            width_tmp /= 2;
            height_tmp /= 2;
            scale *= 2;
        }

        // Decode with inSampleSize
        BitmapFactory.Options o2 = new BitmapFactory.Options();
        o2.inSampleSize = scale;
        Bitmap bitmap = BitmapFactory.decodeStream(getContentResolver().openInputStream(selectedImage), null, o2);

        // Rotate according to EXIF
        int rotate = 0;
        try
        {
            ExifInterface exif = new ExifInterface(getContentResolver().openInputStream(selectedImage));
            int orientation = exif.getAttributeInt(ExifInterface.TAG_ORIENTATION, ExifInterface.ORIENTATION_NORMAL);
            switch (orientation) {
                case ExifInterface.ORIENTATION_ROTATE_270:
                    rotate = 270;
                    break;
                case ExifInterface.ORIENTATION_ROTATE_180:
                    rotate = 180;
                    break;
                case ExifInterface.ORIENTATION_ROTATE_90:
                    rotate = 90;
                    break;
            }
        }
        catch (IOException e)
        {
            Log.e("MainActivity", "ExifInterface IOException");
        }

        Matrix matrix = new Matrix();
        matrix.postRotate(rotate);
        return Bitmap.createBitmap(bitmap, 0, 0, bitmap.getWidth(), bitmap.getHeight(), matrix, true);
    }
}
================================================
FILE: detector/YOLOX/demo/ncnn/android/app/src/main/java/com/megvii/yoloXncnn/YOLOXncnn.java
================================================
// Copyright (C) Megvii, Inc. and its affiliates. All rights reserved.
package com.megvii.yoloXncnn;
import android.content.res.AssetManager;
import android.graphics.Bitmap;
/**
 * JNI bridge to the native ncnn YOLOX detector (libyoloXncnn.so).
 * Both methods are implemented in jni/yoloXncnn_jni.cpp.
 */
public class YOLOXncnn
{
    /** Load the model from app assets; returns false on failure. */
    public native boolean Init(AssetManager mgr);

    /** One detection as returned by the native side. */
    public class Obj
    {
        public float x;      // box top-left x (pixels)
        public float y;      // box top-left y (pixels)
        public float w;      // box width (pixels)
        public float h;      // box height (pixels)
        public String label; // class name
        public float prob;   // confidence score
    }

    /** Run detection; use_gpu selects the Vulkan backend. */
    public native Obj[] Detect(Bitmap bitmap, boolean use_gpu);

    static {
        System.loadLibrary("yoloXncnn");
    }
}
================================================
FILE: detector/YOLOX/demo/ncnn/android/app/src/main/java/com/megvii/yoloXncnn/yoloXncnn.java
================================================
// Copyright (C) Megvii, Inc. and its affiliates. All rights reserved.
package com.megvii.yoloXncnn;
import android.content.res.AssetManager;
import android.graphics.Bitmap;
// JNI binding to the native YOLOX/ncnn detector (duplicate of
// YOLOXncnn.java kept under a lowercase filename).
public class YOLOXncnn
{
// Loads the yolox.param / yolox.bin model files from app assets;
// returns false on any load failure.
public native boolean Init(AssetManager mgr);
// A single detection: box (x, y, w, h) in bitmap pixels, label and score.
public class Obj
{
public float x;
public float y;
public float w;
public float h;
public String label;
public float prob;
}
// Detects objects in the bitmap; use_gpu selects Vulkan compute.
public native Obj[] Detect(Bitmap bitmap, boolean use_gpu);
// Pull in the native implementation.
static {
System.loadLibrary("yoloXncnn");
}
}
================================================
FILE: detector/YOLOX/demo/ncnn/android/app/src/main/jni/CMakeLists.txt
================================================
# Build script for the JNI module of the YOLOX ncnn Android demo.
project(yoloXncnn)
cmake_minimum_required(VERSION 3.4.1)
# Point find_package() at the prebuilt ncnn Android package for the ABI
# currently being built (arm64-v8a, armeabi-v7a, ...).
set(ncnn_DIR ${CMAKE_SOURCE_DIR}/ncnn-20210525-android-vulkan/${ANDROID_ABI}/lib/cmake/ncnn)
find_package(ncnn REQUIRED)
add_library(yoloXncnn SHARED yoloXncnn_jni.cpp)
# jnigraphics provides the AndroidBitmap_* APIs used by the JNI code.
target_link_libraries(yoloXncnn
ncnn
jnigraphics
)
================================================
FILE: detector/YOLOX/demo/ncnn/android/app/src/main/jni/yoloXncnn_jni.cpp
================================================
// Some code in this file is based on:
// https://github.com/nihui/ncnn-android-yolov5/blob/master/app/src/main/jni/yolov5ncnn_jni.cpp
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
// Copyright (C) Megvii, Inc. and its affiliates. All rights reserved.
#include <android/asset_manager_jni.h>
#include <android/bitmap.h>
#include <android/log.h>

#include <jni.h>

#include <float.h>
#include <vector>

// ncnn
#include "layer.h"
#include "net.h"
#include "benchmark.h"
// Shared ncnn allocators, reused across Detect() calls so blob/workspace
// memory is not re-allocated on every inference.
static ncnn::UnlockedPoolAllocator g_blob_pool_allocator;
static ncnn::PoolAllocator g_workspace_pool_allocator;
// The single process-wide network instance; loaded once in Init().
static ncnn::Net yoloX;
// ncnn custom layer implementing the YOLOv5/YOLOX "Focus" module: a
// space-to-depth rearrangement mapping (w, h, c) -> (w/2, h/2, 4c) by
// sampling every second pixel. Registered in Init() because the ONNX
// slice pattern emitted for Focus is not supported by onnx2ncnn.
class YoloV5Focus : public ncnn::Layer
{
public:
YoloV5Focus()
{
// exactly one input blob and one output blob
one_blob_only = true;
}
virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, const ncnn::Option& opt) const
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
// output: half spatial resolution, four times the channels
int outw = w / 2;
int outh = h / 2;
int outc = channels * 4;
top_blob.create(outw, outh, outc, 4u, 1, opt.blob_allocator);
if (top_blob.empty())
return -100; // ncnn convention: negative return signals failure
#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outc; p++)
{
// Each output channel reads one of the four 2x2 phases of input
// channel p % channels: row offset (p/channels)%2, column offset
// (p/channels)/2.
const float* ptr = bottom_blob.channel(p % channels).row((p / channels) % 2) + ((p / channels) / 2);
float* outptr = top_blob.channel(p);
for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)
{
*outptr = *ptr;
outptr += 1;
ptr += 2; // skip every other column
}
ptr += w; // skip every other row
}
}
return 0;
}
};
DEFINE_LAYER_CREATOR(YoloV5Focus)
// One detection candidate: top-left corner (x, y), box size (w, h),
// class index and confidence score.
struct Object
{
float x;
float y;
float w;
float h;
int label;
float prob;
};
// One anchor point of the decoded output: grid-cell coordinates
// (grid0, grid1) and the stride of the feature map it belongs to.
struct GridAndStride
{
int grid0;
int grid1;
int stride;
};
// Area of overlap between two (x, y, w, h) boxes; 0 when they are disjoint.
static inline float intersection_area(const Object& a, const Object& b)
{
    float left = std::max(a.x, b.x);
    float top = std::max(a.y, b.y);
    float right = std::min(a.x + a.w, b.x + b.w);
    float bottom = std::min(a.y + a.h, b.y + b.h);
    if (right < left || bottom < top)
        return 0.f; // no overlap
    return (right - left) * (bottom - top);
}
static void qsort_descent_inplace(std::vector& faceobjects, int left, int right)
{
int i = left;
int j = right;
float p = faceobjects[(left + right) / 2].prob;
while (i <= j)
{
while (faceobjects[i].prob > p)
i++;
while (faceobjects[j].prob < p)
j--;
if (i <= j)
{
// swap
std::swap(faceobjects[i], faceobjects[j]);
i++;
j--;
}
}
#pragma omp parallel sections
{
#pragma omp section
{
if (left < j) qsort_descent_inplace(faceobjects, left, j);
}
#pragma omp section
{
if (i < right) qsort_descent_inplace(faceobjects, i, right);
}
}
}
static void qsort_descent_inplace(std::vector& faceobjects)
{
if (faceobjects.empty())
return;
qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1);
}
static void nms_sorted_bboxes(const std::vector& faceobjects, std::vector& picked, float nms_threshold)
{
picked.clear();
const int n = faceobjects.size();
std::vector areas(n);
for (int i = 0; i < n; i++)
{
areas[i] = faceobjects[i].w * faceobjects[i].h;
}
for (int i = 0; i < n; i++)
{
const Object& a = faceobjects[i];
int keep = 1;
for (int j = 0; j < (int)picked.size(); j++)
{
const Object& b = faceobjects[picked[j]];
// intersection over union
float inter_area = intersection_area(a, b);
float union_area = areas[i] + areas[picked[j]] - inter_area;
// float IoU = inter_area / union_area
if (inter_area / union_area > nms_threshold)
keep = 0;
}
if (keep)
picked.push_back(i);
}
}
static void generate_grids_and_stride(const int target_size, std::vector& strides, std::vector& grid_strides)
{
for (auto stride : strides)
{
int num_grid = target_size / stride;
for (int g1 = 0; g1 < num_grid; g1++)
{
for (int g0 = 0; g0 < num_grid; g0++)
{
grid_strides.push_back((GridAndStride){g0, g1, stride});
}
}
}
}
static void generate_yolox_proposals(std::vector grid_strides, const ncnn::Mat& feat_blob, float prob_threshold, std::vector& objects)
{
const int num_grid = feat_blob.h;
fprintf(stderr, "output height: %d, width: %d, channels: %d, dims:%d\n", feat_blob.h, feat_blob.w, feat_blob.c, feat_blob.dims);
const int num_class = feat_blob.w - 5;
const int num_anchors = grid_strides.size();
const float* feat_ptr = feat_blob.channel(0);
for (int anchor_idx = 0; anchor_idx < num_anchors; anchor_idx++)
{
const int grid0 = grid_strides[anchor_idx].grid0;
const int grid1 = grid_strides[anchor_idx].grid1;
const int stride = grid_strides[anchor_idx].stride;
// yolox/models/yolo_head.py decode logic
// outputs[..., :2] = (outputs[..., :2] + grids) * strides
// outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides
float x_center = (feat_ptr[0] + grid0) * stride;
float y_center = (feat_ptr[1] + grid1) * stride;
float w = exp(feat_ptr[2]) * stride;
float h = exp(feat_ptr[3]) * stride;
float x0 = x_center - w * 0.5f;
float y0 = y_center - h * 0.5f;
float box_objectness = feat_ptr[4];
for (int class_idx = 0; class_idx < num_class; class_idx++)
{
float box_cls_score = feat_ptr[5 + class_idx];
float box_prob = box_objectness * box_cls_score;
if (box_prob > prob_threshold)
{
Object obj;
obj.x = x0;
obj.y = y0;
obj.w = w;
obj.h = h;
obj.label = class_idx;
obj.prob = box_prob;
objects.push_back(obj);
}
} // class loop
feat_ptr += feat_blob.w;
} // point anchor loop
}
extern "C" {
// Cached JNI handles for com.megvii.yoloXncnn.YOLOXncnn$Obj, filled in
// by Init() and used by Detect() to build result objects.
// FIXME DeleteGlobalRef is missing for objCls
static jclass objCls = NULL;
static jmethodID constructortorId; // inner-class ctor: Obj(YOLOXncnn outer)
static jfieldID xId;
static jfieldID yId;
static jfieldID wId;
static jfieldID hId;
static jfieldID labelId;
static jfieldID probId;
// Called by the JVM when libyoloXncnn.so is loaded: create the ncnn
// Vulkan GPU instance once for the whole process.
JNIEXPORT jint JNI_OnLoad(JavaVM* vm, void* reserved)
{
__android_log_print(ANDROID_LOG_DEBUG, "YOLOXncnn", "JNI_OnLoad");
ncnn::create_gpu_instance();
return JNI_VERSION_1_4;
}
// Called when the library is unloaded: tear the GPU instance down again.
JNIEXPORT void JNI_OnUnload(JavaVM* vm, void* reserved)
{
__android_log_print(ANDROID_LOG_DEBUG, "YOLOXncnn", "JNI_OnUnload");
ncnn::destroy_gpu_instance();
}
// public native boolean Init(AssetManager mgr);
// Configures ncnn (4 threads, pooled allocators, Vulkan when a GPU is
// present), loads yolox.param / yolox.bin from the app assets, and caches
// the JNI class/constructor/field ids Detect() needs to build Obj results.
// Returns JNI_FALSE if either model file fails to load.
JNIEXPORT jboolean JNICALL Java_com_megvii_yoloXncnn_YOLOXncnn_Init(JNIEnv* env, jobject thiz, jobject assetManager)
{
    ncnn::Option opt;
    opt.lightmode = true;
    opt.num_threads = 4;
    opt.blob_allocator = &g_blob_pool_allocator;
    opt.workspace_allocator = &g_workspace_pool_allocator;
    opt.use_packing_layout = true;

    // use vulkan compute when a capable GPU exists
    if (ncnn::get_gpu_count() != 0)
        opt.use_vulkan_compute = true;

    AAssetManager* mgr = AAssetManager_fromJava(env, assetManager);

    yoloX.opt = opt;

    // register the custom Focus layer referenced by yolox.param
    yoloX.register_custom_layer("YoloV5Focus", YoloV5Focus_layer_creator);

    // init param
    {
        int ret = yoloX.load_param(mgr, "yolox.param");
        if (ret != 0)
        {
            __android_log_print(ANDROID_LOG_DEBUG, "YOLOXncnn", "load_param failed");
            return JNI_FALSE;
        }
    }

    // init bin
    {
        int ret = yoloX.load_model(mgr, "yolox.bin");
        if (ret != 0)
        {
            __android_log_print(ANDROID_LOG_DEBUG, "YOLOXncnn", "load_model failed");
            return JNI_FALSE;
        }
    }

    // init jni glue: cache class / constructor / field ids for Obj
    jclass localObjCls = env->FindClass("com/megvii/yoloXncnn/YOLOXncnn$Obj");
    objCls = reinterpret_cast<jclass>(env->NewGlobalRef(localObjCls));

    // Obj is a non-static inner class, so its constructor takes the outer instance.
    constructortorId = env->GetMethodID(objCls, "<init>", "(Lcom/megvii/yoloXncnn/YOLOXncnn;)V");

    xId = env->GetFieldID(objCls, "x", "F");
    yId = env->GetFieldID(objCls, "y", "F");
    wId = env->GetFieldID(objCls, "w", "F");
    hId = env->GetFieldID(objCls, "h", "F");
    labelId = env->GetFieldID(objCls, "label", "Ljava/lang/String;");
    probId = env->GetFieldID(objCls, "prob", "F");

    return JNI_TRUE;
}
// public native Obj[] Detect(Bitmap bitmap, boolean use_gpu);
// Runs YOLOX on an RGBA_8888 bitmap and returns an array of Obj results
// with coordinates mapped back to original-bitmap pixels. Returns NULL
// when the bitmap format is wrong, or when use_gpu is requested but no
// Vulkan-capable GPU exists.
JNIEXPORT jobjectArray JNICALL Java_com_megvii_yoloXncnn_YOLOXncnn_Detect(JNIEnv* env, jobject thiz, jobject bitmap, jboolean use_gpu)
{
    if (use_gpu == JNI_TRUE && ncnn::get_gpu_count() == 0)
    {
        return NULL; // no vulkan capable gpu
    }

    double start_time = ncnn::get_current_time();

    AndroidBitmapInfo info;
    AndroidBitmap_getInfo(env, bitmap, &info);
    const int width = info.width;
    const int height = info.height;
    if (info.format != ANDROID_BITMAP_FORMAT_RGBA_8888)
        return NULL;

    // parameters which might change for different model
    const int target_size = 640;
    const float prob_threshold = 0.3f;
    const float nms_threshold = 0.65f;
    std::vector<int> strides = {8, 16, 32}; // might have stride=64

    // python 0-1 input tensor with rgb_means = (0.485, 0.456, 0.406), std = (0.229, 0.224, 0.225)
    // so for 0-255 input image, rgb_mean should multiply 255 and norm should div by std.
    const float mean_vals[3] = {255.f * 0.485f, 255.f * 0.456f, 255.f * 0.406f};
    const float norm_vals[3] = {1 / (255.f * 0.229f), 1 / (255.f * 0.224f), 1 / (255.f * 0.225f)};

    // letterbox: resize so the longer side equals target_size, keep aspect ratio
    int w = width;
    int h = height;
    float scale = 1.f;
    if (w > h)
    {
        scale = (float)target_size / w;
        w = target_size;
        h = h * scale;
    }
    else
    {
        scale = (float)target_size / h;
        h = target_size;
        w = w * scale;
    }

    ncnn::Mat in = ncnn::Mat::from_android_bitmap_resize(env, bitmap, ncnn::Mat::PIXEL_RGB, w, h);

    // pad to target_size rectangle
    int wpad = target_size - w;
    int hpad = target_size - h;
    ncnn::Mat in_pad;
    // different from yolov5, yolox only pads on the bottom and right side,
    // so no extra padding info is needed to decode box coordinates.
    ncnn::copy_make_border(in, in_pad, 0, hpad, 0, wpad, ncnn::BORDER_CONSTANT, 114.f);

    // yolox
    std::vector<Object> objects;
    {
        in_pad.substract_mean_normalize(mean_vals, norm_vals);

        ncnn::Extractor ex = yoloX.create_extractor();
        ex.set_vulkan_compute(use_gpu);
        ex.input("images", in_pad);

        std::vector<Object> proposals;

        // yolox decode and generate proposal logic
        {
            ncnn::Mat out;
            ex.extract("output", out);

            std::vector<GridAndStride> grid_strides;
            generate_grids_and_stride(target_size, strides, grid_strides);
            generate_yolox_proposals(grid_strides, out, prob_threshold, proposals);
        }

        // sort all proposals by score from highest to lowest
        qsort_descent_inplace(proposals);

        // apply nms with nms_threshold
        std::vector<int> picked;
        nms_sorted_bboxes(proposals, picked, nms_threshold);

        int count = picked.size();
        objects.resize(count);
        for (int i = 0; i < count; i++)
        {
            objects[i] = proposals[picked[i]];

            // undo the letterbox scale back to original bitmap coordinates
            float x0 = (objects[i].x) / scale;
            float y0 = (objects[i].y) / scale;
            float x1 = (objects[i].x + objects[i].w) / scale;
            float y1 = (objects[i].y + objects[i].h) / scale;

            // clip to image bounds
            x0 = std::max(std::min(x0, (float)(width - 1)), 0.f);
            y0 = std::max(std::min(y0, (float)(height - 1)), 0.f);
            x1 = std::max(std::min(x1, (float)(width - 1)), 0.f);
            y1 = std::max(std::min(y1, (float)(height - 1)), 0.f);

            objects[i].x = x0;
            objects[i].y = y0;
            objects[i].w = x1 - x0;
            objects[i].h = y1 - y0;
        }
    }

    // objects to Obj[]
    static const char* class_names[] = {
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush"
    };

    jobjectArray jObjArray = env->NewObjectArray(objects.size(), objCls, NULL);
    for (size_t i = 0; i < objects.size(); i++)
    {
        jobject jObj = env->NewObject(objCls, constructortorId, thiz);

        env->SetFloatField(jObj, xId, objects[i].x);
        env->SetFloatField(jObj, yId, objects[i].y);
        env->SetFloatField(jObj, wId, objects[i].w);
        env->SetFloatField(jObj, hId, objects[i].h);
        env->SetObjectField(jObj, labelId, env->NewStringUTF(class_names[objects[i].label]));
        env->SetFloatField(jObj, probId, objects[i].prob);

        env->SetObjectArrayElement(jObjArray, i, jObj);
    }

    double elasped = ncnn::get_current_time() - start_time;
    __android_log_print(ANDROID_LOG_DEBUG, "YOLOXncnn", "%.2fms detect", elasped);

    return jObjArray;
}
}
================================================
FILE: detector/YOLOX/demo/ncnn/android/app/src/main/res/layout/main.xml
================================================
================================================
FILE: detector/YOLOX/demo/ncnn/android/app/src/main/res/values/strings.xml
================================================
yoloXncnn
================================================
FILE: detector/YOLOX/demo/ncnn/android/build.gradle
================================================
// Top-level build file where you can add configuration options common to all sub-projects/modules.
buildscript {
repositories {
// NOTE(review): jcenter() has been read-only since 2021; consider mavenCentral().
jcenter()
google()
}
dependencies {
// Android Gradle plugin used by the :app module.
classpath 'com.android.tools.build:gradle:3.5.0'
}
}
allprojects {
repositories {
jcenter()
google()
}
}
================================================
FILE: detector/YOLOX/demo/ncnn/android/gradle/wrapper/gradle-wrapper.properties
================================================
#Sun Aug 25 10:34:48 CST 2019
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-5.4.1-all.zip
================================================
FILE: detector/YOLOX/demo/ncnn/android/gradlew
================================================
#!/usr/bin/env sh
##############################################################################
##
## Gradle start up script for UN*X
##
##############################################################################
# Attempt to set APP_HOME
# Resolve links: $0 may be a link
PRG="$0"
# Need this for relative symlinks.
while [ -h "$PRG" ] ; do
ls=`ls -ld "$PRG"`
link=`expr "$ls" : '.*-> \(.*\)$'`
if expr "$link" : '/.*' > /dev/null; then
PRG="$link"
else
PRG=`dirname "$PRG"`"/$link"
fi
done
SAVED="`pwd`"
cd "`dirname \"$PRG\"`/" >/dev/null
APP_HOME="`pwd -P`"
cd "$SAVED" >/dev/null
APP_NAME="Gradle"
APP_BASE_NAME=`basename "$0"`
# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS=""
# Use the maximum available, or set MAX_FD != -1 to use that value.
MAX_FD="maximum"
warn () {
echo "$*"
}
die () {
echo
echo "$*"
echo
exit 1
}
# OS specific support (must be 'true' or 'false').
cygwin=false
msys=false
darwin=false
nonstop=false
case "`uname`" in
CYGWIN* )
cygwin=true
;;
Darwin* )
darwin=true
;;
MINGW* )
msys=true
;;
NONSTOP* )
nonstop=true
;;
esac
CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
# Determine the Java command to use to start the JVM.
if [ -n "$JAVA_HOME" ] ; then
if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
# IBM's JDK on AIX uses strange locations for the executables
JAVACMD="$JAVA_HOME/jre/sh/java"
else
JAVACMD="$JAVA_HOME/bin/java"
fi
if [ ! -x "$JAVACMD" ] ; then
die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
else
JAVACMD="java"
which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
# Increase the maximum file descriptors if we can.
if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
MAX_FD_LIMIT=`ulimit -H -n`
if [ $? -eq 0 ] ; then
if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
MAX_FD="$MAX_FD_LIMIT"
fi
ulimit -n $MAX_FD
if [ $? -ne 0 ] ; then
warn "Could not set maximum file descriptor limit: $MAX_FD"
fi
else
warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
fi
fi
# For Darwin, add options to specify how the application appears in the dock
if $darwin; then
GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
fi
# For Cygwin, switch paths to Windows format before running java
if $cygwin ; then
APP_HOME=`cygpath --path --mixed "$APP_HOME"`
CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
JAVACMD=`cygpath --unix "$JAVACMD"`
# We build the pattern for arguments to be converted via cygpath
ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
SEP=""
for dir in $ROOTDIRSRAW ; do
ROOTDIRS="$ROOTDIRS$SEP$dir"
SEP="|"
done
OURCYGPATTERN="(^($ROOTDIRS))"
# Add a user-defined pattern to the cygpath arguments
if [ "$GRADLE_CYGPATTERN" != "" ] ; then
OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
fi
# Now convert the arguments - kludge to limit ourselves to /bin/sh
i=0
for arg in "$@" ; do
CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
else
eval `echo args$i`="\"$arg\""
fi
i=$((i+1))
done
case $i in
(0) set -- ;;
(1) set -- "$args0" ;;
(2) set -- "$args0" "$args1" ;;
(3) set -- "$args0" "$args1" "$args2" ;;
(4) set -- "$args0" "$args1" "$args2" "$args3" ;;
(5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
(6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
(7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
(8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
(9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
esac
fi
# Escape application args
save () {
for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
echo " "
}
APP_ARGS=$(save "$@")
# Collect all arguments for the java command, following the shell quoting and substitution rules
eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
cd "$(dirname "$0")"
fi
exec "$JAVACMD" "$@"
================================================
FILE: detector/YOLOX/demo/ncnn/android/gradlew.bat
================================================
@if "%DEBUG%" == "" @echo off
@rem ##########################################################################
@rem
@rem Gradle startup script for Windows
@rem
@rem ##########################################################################
@rem Set local scope for the variables with windows NT shell
if "%OS%"=="Windows_NT" setlocal
set DIRNAME=%~dp0
if "%DIRNAME%" == "" set DIRNAME=.
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%
@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
set DEFAULT_JVM_OPTS=
@rem Find java.exe
if defined JAVA_HOME goto findJavaFromJavaHome
set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
if "%ERRORLEVEL%" == "0" goto init
echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
goto fail
:findJavaFromJavaHome
set JAVA_HOME=%JAVA_HOME:"=%
set JAVA_EXE=%JAVA_HOME%/bin/java.exe
if exist "%JAVA_EXE%" goto init
echo.
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
goto fail
:init
@rem Get command-line arguments, handling Windows variants
if not "%OS%" == "Windows_NT" goto win9xME_args
:win9xME_args
@rem Slurp the command line arguments.
set CMD_LINE_ARGS=
set _SKIP=2
:win9xME_args_slurp
if "x%~1" == "x" goto execute
set CMD_LINE_ARGS=%*
:execute
@rem Setup the command line
set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
@rem Execute Gradle
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
:end
@rem End local scope for the variables with windows NT shell
if "%ERRORLEVEL%"=="0" goto mainEnd
:fail
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
rem the _cmd.exe /c_ return code!
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
exit /b 1
:mainEnd
if "%OS%"=="Windows_NT" endlocal
:omega
================================================
FILE: detector/YOLOX/demo/ncnn/android/settings.gradle
================================================
include ':app'
================================================
FILE: detector/YOLOX/demo/ncnn/cpp/README.md
================================================
# YOLOX-CPP-ncnn
Cpp file compile of YOLOX object detection base on [ncnn](https://github.com/Tencent/ncnn).
## Tutorial
### Step1
Clone [ncnn](https://github.com/Tencent/ncnn) first, then follow the [build tutorial of ncnn](https://github.com/Tencent/ncnn/wiki/how-to-build) to build it on your own device.
### Step2
Use provided tools to generate onnx file.
For example, if you want to generate onnx file of yolox-s, please run the following command:
```shell
cd <YOLOX_HOME>
python3 tools/export_onnx.py -n yolox-s
```
Then, a yolox.onnx file is generated.
### Step3
Generate ncnn param and bin file.
```shell
cd <path of ncnn>
cd build/tools/ncnn
./onnx2ncnn yolox.onnx yolox.param yolox.bin
```
Since the Focus module is not supported in ncnn, warnings like:
```shell
Unsupported slice step !
```
will be printed. However, don't worry! C++ version of Focus layer is already implemented in yolox.cpp.
### Step4
Open **yolox.param**, and modify it.
Before (just an example):
```
295 328
Input images 0 1 images
Split splitncnn_input0 1 4 images images_splitncnn_0 images_splitncnn_1 images_splitncnn_2 images_splitncnn_3
Crop Slice_4 1 1 images_splitncnn_3 647 -23309=1,0 -23310=1,2147483647 -23311=1,1
Crop Slice_9 1 1 647 652 -23309=1,0 -23310=1,2147483647 -23311=1,2
Crop Slice_14 1 1 images_splitncnn_2 657 -23309=1,0 -23310=1,2147483647 -23311=1,1
Crop Slice_19 1 1 657 662 -23309=1,1 -23310=1,2147483647 -23311=1,2
Crop Slice_24 1 1 images_splitncnn_1 667 -23309=1,1 -23310=1,2147483647 -23311=1,1
Crop Slice_29 1 1 667 672 -23309=1,0 -23310=1,2147483647 -23311=1,2
Crop Slice_34 1 1 images_splitncnn_0 677 -23309=1,1 -23310=1,2147483647 -23311=1,1
Crop Slice_39 1 1 677 682 -23309=1,1 -23310=1,2147483647 -23311=1,2
Concat Concat_40 4 1 652 672 662 682 683 0=0
...
```
* Change the first number from 295 to 295 - 9 = 286 (we remove 10 layers and add 1, so the total layer count decreases by 9).
* Then remove 10 lines of code from Split to Concat, but remember the last but 2nd number: 683.
* Add YoloV5Focus layer After Input (using previous number 683):
```
YoloV5Focus focus 1 1 images 683
```
After (just an example):
```
286 328
Input images 0 1 images
YoloV5Focus focus 1 1 images 683
...
```
### Step5
Use ncnnoptimize to generate the new param and bin files:
```shell
# suppose you are still under ncnn/build/tools/ncnn dir.
../ncnnoptimize model.param model.bin yolox.param yolox.bin 65536
```
### Step6
Copy or Move yolox.cpp file into ncnn/examples, modify the CMakeList.txt, then build yolox
### Step7
Inference image with executable file yolox, enjoy the detect result:
```shell
./yolox demo.jpg
```
## Acknowledgement
* [ncnn](https://github.com/Tencent/ncnn)
================================================
FILE: detector/YOLOX/demo/ncnn/cpp/yolox.cpp
================================================
// This file is wirtten base on the following file:
// https://github.com/Tencent/ncnn/blob/master/examples/yolov5.cpp
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
// ------------------------------------------------------------------------------
// Copyright (C) 2020-2021, Megvii Inc. All rights reserved.
#include "layer.h"
#include "net.h"
#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <float.h>
#include <stdio.h>
#include <vector>
// YOLOX use the same focus in yolov5
// Custom ncnn layer: a space-to-depth rearrangement mapping (w, h, c)
// -> (w/2, h/2, 4c) by sampling every second pixel, replacing the ONNX
// slice pattern that onnx2ncnn cannot convert.
class YoloV5Focus : public ncnn::Layer
{
public:
YoloV5Focus()
{
// exactly one input blob and one output blob
one_blob_only = true;
}
virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, const ncnn::Option& opt) const
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
// output: half spatial resolution, four times the channels
int outw = w / 2;
int outh = h / 2;
int outc = channels * 4;
top_blob.create(outw, outh, outc, 4u, 1, opt.blob_allocator);
if (top_blob.empty())
return -100; // ncnn convention: negative return signals failure
#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outc; p++)
{
// Each output channel reads one of the four 2x2 phases of input
// channel p % channels.
const float* ptr = bottom_blob.channel(p % channels).row((p / channels) % 2) + ((p / channels) / 2);
float* outptr = top_blob.channel(p);
for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)
{
*outptr = *ptr;
outptr += 1;
ptr += 2; // skip every other column
}
ptr += w; // skip every other row
}
}
return 0;
}
};
DEFINE_LAYER_CREATOR(YoloV5Focus)
// One detection: bounding box in image pixels, class index and score.
// (Restores the cv::Rect_<float> template argument lost to angle-bracket
// stripping in the extracted source.)
struct Object
{
    cv::Rect_<float> rect;
    int label;
    float prob;
};
// One anchor point of the decoded output: grid-cell coordinates
// (grid0, grid1) and the stride of the feature map it belongs to.
struct GridAndStride
{
int grid0;
int grid1;
int stride;
};
static inline float intersection_area(const Object& a, const Object& b)
{
cv::Rect_ inter = a.rect & b.rect;
return inter.area();
}
static void qsort_descent_inplace(std::vector& faceobjects, int left, int right)
{
int i = left;
int j = right;
float p = faceobjects[(left + right) / 2].prob;
while (i <= j)
{
while (faceobjects[i].prob > p)
i++;
while (faceobjects[j].prob < p)
j--;
if (i <= j)
{
// swap
std::swap(faceobjects[i], faceobjects[j]);
i++;
j--;
}
}
#pragma omp parallel sections
{
#pragma omp section
{
if (left < j) qsort_descent_inplace(faceobjects, left, j);
}
#pragma omp section
{
if (i < right) qsort_descent_inplace(faceobjects, i, right);
}
}
}
static void qsort_descent_inplace(std::vector& objects)
{
if (objects.empty())
return;
qsort_descent_inplace(objects, 0, objects.size() - 1);
}
static void nms_sorted_bboxes(const std::vector& faceobjects, std::vector& picked, float nms_threshold)
{
picked.clear();
const int n = faceobjects.size();
std::vector areas(n);
for (int i = 0; i < n; i++)
{
areas[i] = faceobjects[i].rect.area();
}
for (int i = 0; i < n; i++)
{
const Object& a = faceobjects[i];
int keep = 1;
for (int j = 0; j < (int)picked.size(); j++)
{
const Object& b = faceobjects[picked[j]];
// intersection over union
float inter_area = intersection_area(a, b);
float union_area = areas[i] + areas[picked[j]] - inter_area;
// float IoU = inter_area / union_area
if (inter_area / union_area > nms_threshold)
keep = 0;
}
if (keep)
picked.push_back(i);
}
}
static void generate_grids_and_stride(const int target_size, std::vector& strides, std::vector& grid_strides)
{
for (auto stride : strides)
{
int num_grid = target_size / stride;
for (int g1 = 0; g1 < num_grid; g1++)
{
for (int g0 = 0; g0 < num_grid; g0++)
{
grid_strides.push_back((GridAndStride){g0, g1, stride});
}
}
}
}
static void generate_yolox_proposals(std::vector grid_strides, const ncnn::Mat& feat_blob, float prob_threshold, std::vector& objects)
{
const int num_grid = feat_blob.h;
fprintf(stderr, "output height: %d, width: %d, channels: %d, dims:%d\n", feat_blob.h, feat_blob.w, feat_blob.c, feat_blob.dims);
const int num_class = feat_blob.w - 5;
const int num_anchors = grid_strides.size();
const float* feat_ptr = feat_blob.channel(0);
for (int anchor_idx = 0; anchor_idx < num_anchors; anchor_idx++)
{
const int grid0 = grid_strides[anchor_idx].grid0;
const int grid1 = grid_strides[anchor_idx].grid1;
const int stride = grid_strides[anchor_idx].stride;
// yolox/models/yolo_head.py decode logic
// outputs[..., :2] = (outputs[..., :2] + grids) * strides
// outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides
float x_center = (feat_ptr[0] + grid0) * stride;
float y_center = (feat_ptr[1] + grid1) * stride;
float w = exp(feat_ptr[2]) * stride;
float h = exp(feat_ptr[3]) * stride;
float x0 = x_center - w * 0.5f;
float y0 = y_center - h * 0.5f;
float box_objectness = feat_ptr[4];
for (int class_idx = 0; class_idx < num_class; class_idx++)
{
float box_cls_score = feat_ptr[5 + class_idx];
float box_prob = box_objectness * box_cls_score;
if (box_prob > prob_threshold)
{
Object obj;
obj.rect.x = x0;
obj.rect.y = y0;
obj.rect.width = w;
obj.rect.height = h;
obj.label = class_idx;
obj.prob = box_prob;
objects.push_back(obj);
}
} // class loop
feat_ptr += feat_blob.w;
} // point anchor loop
}
// Load the yolox ncnn model, run it on a BGR image, and fill `objects`
// with the final (NMS-filtered) detections in original-image pixel
// coordinates. Returns 0 on success.
static int detect_yolox(const cv::Mat& bgr, std::vector<Object>& objects)
{
    ncnn::Net yolox;

    yolox.opt.use_vulkan_compute = true;
    // yolox.opt.use_bf16_storage = true;

    // register the custom Focus layer referenced by yolox.param
    yolox.register_custom_layer("YoloV5Focus", YoloV5Focus_layer_creator);

    // original pretrained model from https://github.com/yolox
    // TODO ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
    yolox.load_param("yolox.param");
    yolox.load_model("yolox.bin");

    const int target_size = 416;
    const float prob_threshold = 0.3f;
    const float nms_threshold = 0.65f;

    int img_w = bgr.cols;
    int img_h = bgr.rows;

    // letterbox: scale the longer side to target_size, keeping aspect ratio
    int w = img_w;
    int h = img_h;
    float scale = 1.f;
    if (w > h)
    {
        scale = (float)target_size / w;
        w = target_size;
        h = h * scale;
    }
    else
    {
        scale = (float)target_size / h;
        h = target_size;
        w = w * scale;
    }

    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);

    // pad to target_size rectangle
    int wpad = target_size - w;
    int hpad = target_size - h;
    ncnn::Mat in_pad;
    // different from yolov5, yolox only pads on the bottom and right side,
    // so no extra padding info is needed to decode box coordinates.
    ncnn::copy_make_border(in, in_pad, 0, hpad, 0, wpad, ncnn::BORDER_CONSTANT, 114.f);

    // python 0-1 input tensor with rgb_means = (0.485, 0.456, 0.406), std = (0.229, 0.224, 0.225)
    // so for 0-255 input image, rgb_mean should multiply 255 and norm should div by std.
    const float mean_vals[3] = {255.f * 0.485f, 255.f * 0.456f, 255.f * 0.406f};
    const float norm_vals[3] = {1 / (255.f * 0.229f), 1 / (255.f * 0.224f), 1 / (255.f * 0.225f)};
    in_pad.substract_mean_normalize(mean_vals, norm_vals);

    ncnn::Extractor ex = yolox.create_extractor();

    ex.input("images", in_pad);

    std::vector<Object> proposals;
    {
        ncnn::Mat out;
        ex.extract("output", out);

        std::vector<int> strides = {8, 16, 32}; // might have stride=64
        std::vector<GridAndStride> grid_strides;
        generate_grids_and_stride(target_size, strides, grid_strides);
        generate_yolox_proposals(grid_strides, out, prob_threshold, proposals);
    }

    // sort all proposals by score from highest to lowest
    qsort_descent_inplace(proposals);

    // apply nms with nms_threshold
    std::vector<int> picked;
    nms_sorted_bboxes(proposals, picked, nms_threshold);

    int count = picked.size();
    objects.resize(count);
    for (int i = 0; i < count; i++)
    {
        objects[i] = proposals[picked[i]];

        // undo the letterbox scale back to original image coordinates
        float x0 = (objects[i].rect.x) / scale;
        float y0 = (objects[i].rect.y) / scale;
        float x1 = (objects[i].rect.x + objects[i].rect.width) / scale;
        float y1 = (objects[i].rect.y + objects[i].rect.height) / scale;

        // clip to image bounds
        x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
        y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
        x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
        y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);

        objects[i].rect.x = x0;
        objects[i].rect.y = y0;
        objects[i].rect.width = x1 - x0;
        objects[i].rect.height = y1 - y0;
    }

    return 0;
}
// Draw detections on a copy of the input image and display it.
// FIX: the element type of the vector was lost in extraction; restored to
// std::vector<Object> to match detect_yolox's output.
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
    // COCO class names, indexed by Object::label.
    static const char* class_names[] = {
        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
        "hair drier", "toothbrush"
    };

    cv::Mat image = bgr.clone();

    for (size_t i = 0; i < objects.size(); i++)
    {
        const Object& obj = objects[i];

        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);

        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));

        char text[256];
        // FIX: snprintf instead of sprintf so a long label can never overflow `text`.
        snprintf(text, sizeof(text), "%s %.1f%%", class_names[obj.label], obj.prob * 100);

        int baseLine = 0;
        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

        // Keep the label box inside the image bounds.
        int x = obj.rect.x;
        int y = obj.rect.y - label_size.height - baseLine;
        if (y < 0)
            y = 0;
        if (x + label_size.width > image.cols)
            x = image.cols - label_size.width;

        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
                      cv::Scalar(255, 255, 255), -1);

        cv::putText(image, text, cv::Point(x, y + label_size.height),
                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
    }

    cv::imshow("image", image);
    cv::waitKey(0);
}
int main(int argc, char** argv)
{
if (argc != 2)
{
fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
return -1;
}
const char* imagepath = argv[1];
cv::Mat m = cv::imread(imagepath, 1);
if (m.empty())
{
fprintf(stderr, "cv::imread %s failed\n", imagepath);
return -1;
}
std::vector objects;
detect_yolox(m, objects);
draw_objects(m, objects);
return 0;
}
================================================
FILE: detector/YOLOX/demo.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
import argparse
import os
import pdb
import time
from loguru import logger
import cv2
import torch
from yolox.data.data_augment import preproc
from yolox.data.datasets import COCO_CLASSES
from yolox.exp import get_exp
from yolox.utils import fuse_model, get_model_info, postprocess, vis
IMAGE_EXT = ['.jpg', '.jpeg', '.webp', '.bmp', '.png']
def make_parser():
    """Create the argparse parser used by the YOLOX demo entry point."""
    parser = argparse.ArgumentParser("YOLOX Demo!")

    # positional: which kind of demo to run
    parser.add_argument("demo", default="image", help="demo type, eg. image, video and webcam")
    parser.add_argument("-expn", "--experiment-name", type=str, default=None)
    parser.add_argument("-n", "--name", type=str, default=None, help="model name")
    parser.add_argument("--path", default="./assets/dog.jpg", help="path to images or video")
    parser.add_argument("--camid", type=int, default=0, help="webcam demo camera id")
    parser.add_argument("--save_result", action="store_true",
                        help="whether to save the inference result of image/video")
    # exp file
    parser.add_argument("-f", "--exp_file", default=None, type=str,
                        help="pls input your expriment description file")
    parser.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt for eval")
    parser.add_argument("--device", default="cpu", type=str,
                        help="device to run our model, can either be cpu or gpu")
    parser.add_argument("--conf", default=None, type=float, help="test conf")
    parser.add_argument("--nms", default=None, type=float, help="test nms threshold")
    parser.add_argument("--tsize", default=None, type=int, help="test img size")
    parser.add_argument("--fp16", dest="fp16", default=False, action="store_true",
                        help="Adopting mix precision evaluating.")
    parser.add_argument("--fuse", dest="fuse", default=False, action="store_true",
                        help="Fuse conv and bn for testing.")
    parser.add_argument("--trt", dest="trt", default=False, action="store_true",
                        help="Using TensorRT model for testing.")
    return parser
def get_image_list(path):
    """Recursively collect paths of files under *path* whose extension is in IMAGE_EXT."""
    return [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(path)
        for name in names
        if os.path.splitext(name)[1] in IMAGE_EXT
    ]
class Predictor(object):
    """Thin wrapper around a YOLOX model for single-image inference.

    Handles preprocessing (resize + normalize), the forward pass,
    NMS post-processing, and drawing of the resulting detections.
    """
    def __init__(self, model, exp, cls_names=COCO_CLASSES, trt_file=None, decoder=None, device="cpu"):
        self.model = model
        self.cls_names = cls_names
        self.decoder = decoder
        # test-time thresholds and input size come from the experiment description
        self.num_classes = exp.num_classes
        self.confthre = exp.test_conf
        self.nmsthre = exp.nmsthre
        self.test_size = exp.test_size
        self.device = device
        if trt_file is not None:
            from torch2trt import TRTModule
            model_trt = TRTModule()
            model_trt.load_state_dict(torch.load(trt_file))
            x = torch.ones(1, 3, exp.test_size[0], exp.test_size[1]).cuda()
            # dummy forward on the original model before swapping in the TRT
            # engine (presumably to initialize CUDA state — TODO confirm)
            self.model(x)
            self.model = model_trt
        # ImageNet statistics consumed by preproc for input normalization
        self.rgb_means = (0.485, 0.456, 0.406)
        self.std = (0.229, 0.224, 0.225)

    def inference(self, img):
        """Run detection on one image (file path or BGR array).

        Returns (outputs, img_info): outputs is the post-processed detection
        list; img_info carries the raw image and the resize ratio needed to
        map boxes back to original coordinates.
        """
        img_info = {'id': 0}
        if isinstance(img, str):
            # a path was given: remember the file name and load the image
            img_info['file_name'] = os.path.basename(img)
            img = cv2.imread(img)
        else:
            img_info['file_name'] = None
        height, width = img.shape[:2]
        img_info['height'] = height
        img_info['width'] = width
        img_info['raw_img'] = img
        img, ratio = preproc(img, self.test_size, self.rgb_means, self.std)
        img_info['ratio'] = ratio
        img = torch.from_numpy(img).unsqueeze(0)  # add batch dimension
        if self.device == "gpu":
            img = img.cuda()
        with torch.no_grad():
            t0 = time.time()
            outputs = self.model(img)
            # TRT path: raw head outputs still need decoding
            if self.decoder is not None:
                outputs = self.decoder(outputs, dtype=outputs.type())
            outputs = postprocess(
                outputs, self.num_classes, self.confthre, self.nmsthre
            )
            logger.info('Infer time: {:.4f}s'.format(time.time()-t0))
        return outputs, img_info

    def visual(self, output, img_info, cls_conf=0.35):
        """Draw detections with score above cls_conf onto the raw image."""
        ratio = img_info['ratio']
        img = img_info['raw_img']
        if output is None:
            # nothing survived post-processing
            return img
        output = output.cpu()
        bboxes = output[:, 0:4]
        # preprocessing: resize
        # NOTE(review): bboxes is a view of `output`, so this also rescales
        # `output` in place — callers reusing `output` should be aware.
        bboxes /= ratio
        cls = output[:, 6]
        scores = output[:, 4] * output[:, 5]
        vis_res = vis(img, bboxes, scores, cls, cls_conf, self.cls_names)
        return vis_res
def image_demo(predictor, vis_folder, path, current_time, save_result):
    """Run detection on one image (or every image under a directory),
    optionally saving the rendered results under vis_folder."""
    targets = get_image_list(path) if os.path.isdir(path) else [path]
    targets.sort()
    for image_name in targets:
        outputs, img_info = predictor.inference(image_name)
        result_image = predictor.visual(outputs[0], img_info)
        if save_result:
            stamp = time.strftime("%Y_%m_%d_%H_%M_%S", current_time)
            save_folder = os.path.join(vis_folder, stamp)
            os.makedirs(save_folder, exist_ok=True)
            save_file_name = os.path.join(save_folder, os.path.basename(image_name))
            logger.info("Saving detection result in {}".format(save_file_name))
            cv2.imwrite(save_file_name, result_image)
        # block until a key press; ESC / q quits early
        key = cv2.waitKey(0)
        if key in (27, ord('q'), ord('Q')):
            break
def imageflow_demo(predictor, vis_folder, current_time, args):
    """Run detection frame by frame on a video file or webcam stream,
    optionally recording the rendered frames to an mp4 under vis_folder."""
    cap = cv2.VideoCapture(args.path if args.demo == 'video' else args.camid)
    # source geometry and frame rate for the output writer
    width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)  # float
    height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)  # float
    fps = cap.get(cv2.CAP_PROP_FPS)
    stamp = time.strftime("%Y_%m_%d_%H_%M_%S", current_time)
    save_folder = os.path.join(vis_folder, stamp)
    os.makedirs(save_folder, exist_ok=True)
    if args.demo == "video":
        save_path = os.path.join(save_folder, args.path.split('/')[-1])
    else:
        save_path = os.path.join(save_folder, 'camera.mp4')
    logger.info(f'video save_path is {save_path}')
    vid_writer = cv2.VideoWriter(
        save_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (int(width), int(height))
    )
    while True:
        ok, frame = cap.read()
        if not ok:
            # end of stream / read failure
            break
        outputs, img_info = predictor.inference(frame)
        result_frame = predictor.visual(outputs[0], img_info)
        if args.save_result:
            vid_writer.write(result_frame)
        key = cv2.waitKey(1)
        if key in (27, ord('q'), ord('Q')):
            break
def main(exp, args):
    """Build the model from the experiment description and run the chosen demo.

    exp  -- experiment object (model factory plus test-time settings)
    args -- parsed CLI namespace from make_parser()
    """
    if not args.experiment_name:
        args.experiment_name = exp.exp_name

    file_name = os.path.join(exp.output_dir, args.experiment_name)
    os.makedirs(file_name, exist_ok=True)

    # FIX: vis_folder used to be bound only under --save_result, but it is
    # passed to image_demo/imageflow_demo unconditionally, which raised a
    # NameError when --save_result was omitted.  Always compute the path;
    # only create the directory when results are actually saved.
    vis_folder = os.path.join(file_name, 'vis_res')
    if args.save_result:
        os.makedirs(vis_folder, exist_ok=True)

    if args.trt:
        # TensorRT engines only run on GPU
        args.device = "gpu"

    logger.info("Args: {}".format(args))

    # CLI overrides for test-time settings
    if args.conf is not None:
        exp.test_conf = args.conf
    if args.nms is not None:
        exp.nmsthre = args.nms
    if args.tsize is not None:
        exp.test_size = (args.tsize, args.tsize)

    model = exp.get_model()
    logger.info("Model Summary: {}".format(get_model_info(model, exp.test_size)))

    if args.device == "gpu":
        model.cuda()
    model.eval()

    if not args.trt:
        if args.ckpt is None:
            ckpt_file = os.path.join(file_name, "best_ckpt.pth.tar")
        else:
            ckpt_file = args.ckpt
        logger.info("loading checkpoint")
        ckpt = torch.load(ckpt_file, map_location="cpu")
        # load the model state dict
        model.load_state_dict(ckpt["model"])
        logger.info("loaded checkpoint done.")

    if args.fuse:
        logger.info("\tFusing model...")
        model = fuse_model(model)

    if args.trt:
        assert (not args.fuse),\
            "TensorRT model is not support model fusing!"
        trt_file = os.path.join(file_name, "model_trt.pth")
        assert os.path.exists(trt_file), (
            "TensorRT model is not found!\n Run python3 tools/trt.py first!"
        )
        # let the Predictor decode the raw head outputs itself
        model.head.decode_in_inference = False
        decoder = model.head.decode_outputs
        logger.info("Using TensorRT to inference")
    else:
        trt_file = None
        decoder = None

    predictor = Predictor(model, exp, COCO_CLASSES, trt_file, decoder, args.device)
    current_time = time.localtime()
    if args.demo == 'image':
        image_demo(predictor, vis_folder, args.path, current_time, args.save_result)
    elif args.demo == 'video' or args.demo == 'webcam':
        imageflow_demo(predictor, vis_folder, current_time, args)
if __name__ == "__main__":
    # Parse CLI arguments, resolve the experiment description, and run the demo.
    args = make_parser().parse_args()
    exp = get_exp(args.exp_file, args.name)
    main(exp, args)
================================================
FILE: detector/YOLOX/docs/train_custom_data.md
================================================
# Train Custom Data.
This page explains how to train your own custom data with YOLOX.
We take fine-tuning the YOLOX-S model on the VOC dataset as an example to give a clearer guide.
## 0. Before you start
Clone this repo and follow the [README](../README.md) to install YOLOX.
## 1. Create your own dataset
**Step 1** Prepare your own dataset with images and labels first. For labeling images, you may use a tool like [Labelme](https://github.com/wkentaro/labelme) or [CVAT](https://github.com/openvinotoolkit/cvat).
**Step 2** Then, you should write the corresponding Dataset Class which can load images and labels through "\_\_getitem\_\_" method. We currently support COCO format and VOC format.
You can also write your own Dataset. Let's take the [VOC](../yolox/data/datasets/voc.py#L151) Dataset file as an example:
```python
@Dataset.resize_getitem
def __getitem__(self, index):
img, target, img_info, img_id = self.pull_item(index)
if self.preproc is not None:
img, target = self.preproc(img, target, self.input_dim)
return img, target, img_info, img_id
```
One more thing worth noting is that you should also implement "[pull_item](../yolox/data/datasets/voc.py#L129)" and "[load_anno](../yolox/data/datasets/voc.py#L121)" method for the Mosaic and MixUp augmentation.
**Step 3** Prepare the evaluator. We currently have [COCO evaluator](../yolox/evaluators/coco_evaluator.py) and [VOC evaluator](../yolox/evaluators/voc_evaluator.py).
If you have your own format data or evaluation metric, you may write your own evaluator.
**Step 4** Put your dataset under `$YOLOX_DIR/datasets`, for VOC:
```shell
ln -s /path/to/your/VOCdevkit ./datasets/VOCdevkit
```
* The path "VOCdevkit" will be used in your exp file described in the next section. Specifically, in the "get_data_loader" and "get_eval_loader" functions.
## 2. Create your Exp file to control everything
We put everything involved in a model to one single Exp file, including model setting, training setting, and testing setting.
**A complete Exp file is at [yolox_base.py](../yolox/exp/yolox_base.py).** It may be too long to write for every exp, but you can inherit the base Exp file and only overwrite the changed part.
Let's still take the [VOC Exp file](../exps/example/yolox_voc/yolox_voc_s.py) for an example.
We select YOLOX-S model here, so we should change the network depth and width. VOC has only 20 classes, so we should also change the num_classes.
These configs are changed in the init() method:
```python
class Exp(MyExp):
def __init__(self):
super(Exp, self).__init__()
self.num_classes = 20
self.depth = 0.33
self.width = 0.50
self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]
```
Besides, you should also overwrite the dataset and evaluator prepared before to train the model on your own data.
Please see "[get_data_loader](../exps/example/yolox_voc/yolox_voc_s.py#L20)", "[get_eval_loader](../exps/example/yolox_voc/yolox_voc_s.py#L82)", and "[get_evaluator](../exps/example/yolox_voc/yolox_voc_s.py#L113)" for more details.
## 3. Train
Except special cases, we always recommend to use our [COCO pretrained weights](../README.md) for initializing.
Once you get the Exp file and the COCO pretrained weights we provided, you can train your own model by the following command:
```bash
python tools/train.py -f /path/to/your/Exp/file -d 8 -b 64 --fp16 -o -c /path/to/the/pretrained/weights
```
or take the YOLOX-S VOC training for example:
```bash
python tools/train.py -f exps/example/yolox_voc/yolox_voc_s.py -d 8 -b 64 --fp16 -o -c /path/to/yolox_s.pth.tar
```
(Don't worry for the different shape of detection head between the pretrained weights and your own model, we will handle it)
## 4. Tips for Best Training Results
As YOLOX is an anchor-free detector with only several hyper-parameters, most of the time good results can be obtained with no changes to the models or training settings.
We thus always recommend you first train with all default training settings.
If at first you don't get good results, there are steps you could consider to take to improve.
**Model Selection** We provide YOLOX-Nano, YOLOX-Tiny, and YOLOX-S for mobile deployments, while YOLOX-M/L/X for cloud or high performance GPU deployments.
If your deployment runs into compatibility trouble, we recommend YOLOX-DarkNet53.
**Training Configs** If your training overfits early, then you can reduce max\_epochs or decrease the base\_lr and min\_lr\_ratio in your Exp file:
```python
# -------------- training config --------------------- #
self.warmup_epochs = 5
self.max_epoch = 300
self.warmup_lr = 0
self.basic_lr_per_img = 0.01 / 64.0
self.scheduler = "yoloxwarmcos"
self.no_aug_epochs = 15
self.min_lr_ratio = 0.05
self.ema = True
self.weight_decay = 5e-4
self.momentum = 0.9
```
**Aug Configs** You may also change the degree of the augmentations.
Generally, for small models you should weaken the aug, while for large models or a small dataset you may enhance the aug in your Exp file:
```python
# --------------- transform config ----------------- #
self.degrees = 10.0
self.translate = 0.1
self.scale = (0.1, 2)
self.mscale = (0.8, 1.6)
self.shear = 2.0
self.perspective = 0.0
self.enable_mixup = True
```
**Design your own detector** You may refer to our [Arxiv](https://arxiv.org/abs/2107.08430) paper for details and suggestions for designing your own detector.
================================================
FILE: detector/YOLOX/exps/default/nano.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
import os
import torch.nn as nn
from yolox.exp import Exp as MyExp
class Exp(MyExp):
    """Experiment configuration for YOLOX-Nano (depthwise separable convs)."""

    def __init__(self):
        super(Exp, self).__init__()
        self.depth = 0.33
        self.width = 0.25
        # multi-scale training range and reduced 416x416 test resolution
        self.scale = (0.5, 1.5)
        self.random_size = (10, 20)
        self.test_size = (416, 416)
        self.enable_mixup = False
        self.exp_name = os.path.basename(os.path.realpath(__file__)).split(".")[0]

    def get_model(self, sublinear=False):
        """Build the YOLOX-Nano model once, re-applying BN init on each call."""
        def init_yolo(module):
            # tighter BatchNorm settings, matching the upstream YOLOX init
            for sub in module.modules():
                if isinstance(sub, nn.BatchNorm2d):
                    sub.eps = 1e-3
                    sub.momentum = 0.03

        if "model" not in self.__dict__:
            from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead
            channels = [256, 512, 1024]
            # NANO model use depthwise = True, which is main difference.
            fpn = YOLOPAFPN(self.depth, self.width, in_channels=channels, depthwise=True)
            det_head = YOLOXHead(self.num_classes, self.width, in_channels=channels, depthwise=True)
            self.model = YOLOX(fpn, det_head)

        self.model.apply(init_yolo)
        self.model.head.initialize_biases(1e-2)
        return self.model
================================================
FILE: detector/YOLOX/exps/default/yolov3.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
import os
import torch
import torch.nn as nn
from yolox.exp import Exp as MyExp
class Exp(MyExp):
    """Experiment configuration for the YOLOv3 (Darknet FPN) baseline."""
    def __init__(self):
        super(Exp, self).__init__()
        # full-size model: no depth/width scaling
        self.depth = 1.0
        self.width = 1.0
        # experiment name derived from this file's name
        self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]

    def get_model(self, sublinear=False):
        """Build the model once (YOLOFPN backbone + YOLOX head), re-applying BN init."""
        def init_yolo(M):
            # tighter BatchNorm settings used by YOLOX training
            for m in M.modules():
                if isinstance(m, nn.BatchNorm2d):
                    m.eps = 1e-3
                    m.momentum = 0.03
        if "model" not in self.__dict__:
            from yolox.models import YOLOX, YOLOFPN, YOLOXHead
            backbone = YOLOFPN()
            # leaky-ReLU head to match the Darknet backbone activation
            head = YOLOXHead(self.num_classes, self.width, in_channels=[128, 256, 512], act="lrelu")
            self.model = YOLOX(backbone, head)
        self.model.apply(init_yolo)
        self.model.head.initialize_biases(1e-2)
        return self.model

    def get_data_loader(self, batch_size, is_distributed, no_aug=False):
        """Build the COCO training loader; no_aug disables Mosaic augmentation."""
        from data.datasets.cocodataset import COCODataset
        from data.datasets.mosaicdetection import MosaicDetection
        from data.datasets.data_augment import TrainTransform
        from data.datasets.dataloading import YoloBatchSampler, DataLoader, InfiniteSampler
        import torch.distributed as dist

        dataset = COCODataset(
            data_dir='data/COCO/',
            json_file=self.train_ann,
            img_size=self.input_size,
            preproc=TrainTransform(
                rgb_means=(0.485, 0.456, 0.406),
                std=(0.229, 0.224, 0.225),
                max_labels=50
            ),
        )

        # wrap with Mosaic / affine augmentation (a mosaic image can carry more labels)
        dataset = MosaicDetection(
            dataset,
            mosaic=not no_aug,
            img_size=self.input_size,
            preproc=TrainTransform(
                rgb_means=(0.485, 0.456, 0.406),
                std=(0.229, 0.224, 0.225),
                max_labels=120
            ),
            degrees=self.degrees,
            translate=self.translate,
            scale=self.scale,
            shear=self.shear,
            perspective=self.perspective,
        )

        self.dataset = dataset

        if is_distributed:
            # split the global batch across workers
            batch_size = batch_size // dist.get_world_size()
            sampler = InfiniteSampler(len(self.dataset), seed=self.seed if self.seed else 0)
        else:
            sampler = torch.utils.data.RandomSampler(self.dataset)

        batch_sampler = YoloBatchSampler(
            sampler=sampler,
            batch_size=batch_size,
            drop_last=False,
            input_dimension=self.input_size,
            mosaic=not no_aug
        )

        dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True}
        dataloader_kwargs["batch_sampler"] = batch_sampler
        train_loader = DataLoader(self.dataset, **dataloader_kwargs)

        return train_loader
================================================
FILE: detector/YOLOX/exps/default/yolox_l.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
import os
from yolox.exp import Exp as MyExp
class Exp(MyExp):
    """YOLOX-L experiment: full-size model (depth 1.0, width 1.0)."""

    def __init__(self):
        super(Exp, self).__init__()
        # scaling factors for backbone depth and channel width
        self.width = 1.0
        self.depth = 1.0
        self.exp_name = os.path.basename(os.path.realpath(__file__)).split(".")[0]
================================================
FILE: detector/YOLOX/exps/default/yolox_m.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
import os
from yolox.exp import Exp as MyExp
class Exp(MyExp):
    """YOLOX-M experiment: medium model (depth 0.67, width 0.75)."""

    def __init__(self):
        super(Exp, self).__init__()
        # scaling factors for backbone depth and channel width
        self.width = 0.75
        self.depth = 0.67
        self.exp_name = os.path.basename(os.path.realpath(__file__)).split(".")[0]
================================================
FILE: detector/YOLOX/exps/default/yolox_s.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
import os
from yolox.exp import Exp as MyExp
class Exp(MyExp):
    """YOLOX-S experiment: small model (depth 0.33, width 0.50)."""

    def __init__(self):
        super(Exp, self).__init__()
        # scaling factors for backbone depth and channel width
        self.width = 0.50
        self.depth = 0.33
        self.exp_name = os.path.basename(os.path.realpath(__file__)).split(".")[0]
================================================
FILE: detector/YOLOX/exps/default/yolox_tiny.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
import os
from yolox.exp import Exp as MyExp
class Exp(MyExp):
    """YOLOX-Tiny experiment: 416x416 test size, mixup disabled."""

    def __init__(self):
        super(Exp, self).__init__()
        self.depth = 0.33
        self.width = 0.375
        # multi-scale training range and reduced test resolution
        self.scale = (0.5, 1.5)
        self.random_size = (10, 20)
        self.test_size = (416, 416)
        self.enable_mixup = False
        self.exp_name = os.path.basename(os.path.realpath(__file__)).split(".")[0]
================================================
FILE: detector/YOLOX/exps/default/yolox_x.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
import os
from yolox.exp import Exp as MyExp
class Exp(MyExp):
    """YOLOX-X experiment: extra-large model (depth 1.33, width 1.25)."""

    def __init__(self):
        super(Exp, self).__init__()
        # scaling factors for backbone depth and channel width
        self.width = 1.25
        self.depth = 1.33
        self.exp_name = os.path.basename(os.path.realpath(__file__)).split(".")[0]
================================================
FILE: detector/YOLOX/exps/example/yolox_voc/yolox_voc_s.py
================================================
# encoding: utf-8
import os
import random
import torch
import torch.nn as nn
import torch.distributed as dist
from yolox.exp import Exp as MyExp
from yolox.data import get_yolox_datadir
class Exp(MyExp):
    """YOLOX-S experiment fine-tuned for Pascal VOC (20 classes)."""
    def __init__(self):
        super(Exp, self).__init__()
        # VOC has 20 classes; depth/width match YOLOX-S
        self.num_classes = 20
        self.depth = 0.33
        self.width = 0.50
        self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]

    def get_data_loader(self, batch_size, is_distributed, no_aug=False):
        """Build the VOC 07+12 trainval loader; no_aug disables Mosaic."""
        from yolox.data import (
            VOCDetection,
            TrainTransform,
            YoloBatchSampler,
            DataLoader,
            InfiniteSampler,
            MosaicDetection,
        )

        dataset = VOCDetection(
            data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"),
            image_sets=[('2007', 'trainval'), ('2012', 'trainval')],
            img_size=self.input_size,
            preproc=TrainTransform(
                rgb_means=(0.485, 0.456, 0.406),
                std=(0.229, 0.224, 0.225),
                max_labels=50,
            ),
        )

        # wrap with Mosaic (and optionally MixUp) augmentation
        dataset = MosaicDetection(
            dataset,
            mosaic=not no_aug,
            img_size=self.input_size,
            preproc=TrainTransform(
                rgb_means=(0.485, 0.456, 0.406),
                std=(0.229, 0.224, 0.225),
                max_labels=120,
            ),
            degrees=self.degrees,
            translate=self.translate,
            scale=self.scale,
            shear=self.shear,
            perspective=self.perspective,
            enable_mixup=self.enable_mixup,
        )

        self.dataset = dataset

        if is_distributed:
            # per-process batch size
            batch_size = batch_size // dist.get_world_size()

        sampler = InfiniteSampler(
            len(self.dataset), seed=self.seed if self.seed else 0
        )

        batch_sampler = YoloBatchSampler(
            sampler=sampler,
            batch_size=batch_size,
            drop_last=False,
            input_dimension=self.input_size,
            mosaic=not no_aug,
        )

        dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True}
        dataloader_kwargs["batch_sampler"] = batch_sampler
        train_loader = DataLoader(self.dataset, **dataloader_kwargs)

        return train_loader

    def get_eval_loader(self, batch_size, is_distributed, testdev=False):
        """Build the VOC2007 test loader used for evaluation."""
        from yolox.data import VOCDetection, ValTransform

        valdataset = VOCDetection(
            data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"),
            image_sets=[('2007', 'test')],
            img_size=self.test_size,
            preproc=ValTransform(
                rgb_means=(0.485, 0.456, 0.406),
                std=(0.229, 0.224, 0.225),
            ),
        )

        if is_distributed:
            batch_size = batch_size // dist.get_world_size()
            sampler = torch.utils.data.distributed.DistributedSampler(
                valdataset, shuffle=False
            )
        else:
            sampler = torch.utils.data.SequentialSampler(valdataset)

        dataloader_kwargs = {
            "num_workers": self.data_num_workers,
            "pin_memory": True,
            "sampler": sampler,
        }
        dataloader_kwargs["batch_size"] = batch_size
        val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs)

        return val_loader

    def get_evaluator(self, batch_size, is_distributed, testdev=False):
        """Create a VOC-mAP evaluator over the eval loader."""
        from yolox.evaluators import VOCEvaluator

        val_loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev)
        evaluator = VOCEvaluator(
            dataloader=val_loader,
            img_size=self.test_size,
            confthre=self.test_conf,
            nmsthre=self.nmsthre,
            num_classes=self.num_classes,
        )
        return evaluator
================================================
FILE: detector/YOLOX/requirements.txt
================================================
numpy
torch>=1.7
opencv_python
loguru
scikit-image
tqdm
torchvision
Pillow
thop
ninja
tabulate
tensorboard
onnxruntime
================================================
FILE: detector/YOLOX/setup.cfg
================================================
[isort]
line_length = 100
multi_line_output = 3
balanced_wrapping = True
known_standard_library = setuptools
known_third_party = tqdm,loguru
known_data_processing = cv2,numpy,scipy,PIL,matplotlib,scikit_image
known_datasets = pycocotools
known_deeplearning = torch,torchvision,caffe2,onnx,apex,timm,thop,torch2trt,tensorrt,openvino,onnxruntime
known_myself = yolox
sections = FUTURE,STDLIB,THIRDPARTY,data_processing,datasets,deeplearning,myself,FIRSTPARTY,LOCALFOLDER
no_lines_before=STDLIB,THIRDPARTY,datasets
default_section = FIRSTPARTY
[flake8]
max-line-length = 100
max-complexity = 18
exclude = __init__.py
================================================
FILE: detector/YOLOX/setup.py
================================================
#!/usr/bin/env python
# Copyright (c) Megvii, Inc. and its affiliates. All Rights Reserved
import re
import setuptools
import glob
from os import path
import torch
from torch.utils.cpp_extension import CppExtension
# Major/minor of the installed torch; the C++ extension build API used below
# requires PyTorch >= 1.3.
torch_ver = [int(x) for x in torch.__version__.split(".")[:2]]
assert torch_ver >= [1, 3], "Requires PyTorch >= 1.3"
def get_extensions():
    """Collect the C++ sources under yolox/layers/csrc into one CppExtension."""
    here = path.dirname(path.abspath(__file__))
    csrc_dir = path.join(here, "yolox", "layers", "csrc")

    # vision.cpp first, then every other .cpp one level down
    entry = path.join(csrc_dir, "vision.cpp")
    all_sources = [entry] + glob.glob(path.join(csrc_dir, "**", "*.cpp"))

    return [
        CppExtension(
            "yolox._C",
            all_sources,
            include_dirs=[csrc_dir],
            define_macros=[],
            extra_compile_args={"cxx": ["-O3"]},
        )
    ]
# Read the package version out of yolox/__init__.py without importing it.
with open("yolox/__init__.py", "r") as f:
    version = re.search(
        r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]',
        f.read(), re.MULTILINE
    ).group(1)

# Long description for the package metadata comes straight from the README.
with open("README.md", "r") as f:
    long_description = f.read()

setuptools.setup(
    name="yolox",
    version=version,
    author="basedet team",
    python_requires=">=3.6",
    long_description=long_description,
    # NOTE(review): get_extensions() is defined above but unused here — the
    # C++ extension build appears intentionally disabled; confirm before re-enabling.
    ext_modules=None,
    classifiers=["Programming Language :: Python :: 3", "Operating System :: OS Independent"],
    cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
    packages=setuptools.find_packages(),
)
================================================
FILE: detector/YOLOX/tools/__init__.py
================================================
###################################################################
# File Name: __init__.py
# Author: Zhongdao Wang
# mail: wcd17@mails.tsinghua.edu.cn
# Created Time: Sun Jul 25 17:14:12 2021
###################################################################
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import
================================================
FILE: detector/YOLOX/tools/demo.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
import argparse
import os
import time
from loguru import logger
import cv2
import torch
from yolox.data.data_augment import preproc
from yolox.data.datasets import COCO_CLASSES
from yolox.exp import get_exp
from yolox.utils import fuse_model, get_model_info, postprocess, vis
IMAGE_EXT = ['.jpg', '.jpeg', '.webp', '.bmp', '.png']
def make_parser():
    """Create the argparse parser used by the YOLOX demo entry point."""
    parser = argparse.ArgumentParser("YOLOX Demo!")

    # positional: which kind of demo to run
    parser.add_argument("demo", default="image", help="demo type, eg. image, video and webcam")
    parser.add_argument("-expn", "--experiment-name", type=str, default=None)
    parser.add_argument("-n", "--name", type=str, default=None, help="model name")
    parser.add_argument("--path", default="./assets/dog.jpg", help="path to images or video")
    parser.add_argument("--camid", type=int, default=0, help="webcam demo camera id")
    parser.add_argument("--save_result", action="store_true",
                        help="whether to save the inference result of image/video")
    # exp file
    parser.add_argument("-f", "--exp_file", default=None, type=str,
                        help="pls input your expriment description file")
    parser.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt for eval")
    parser.add_argument("--device", default="cpu", type=str,
                        help="device to run our model, can either be cpu or gpu")
    parser.add_argument("--conf", default=None, type=float, help="test conf")
    parser.add_argument("--nms", default=None, type=float, help="test nms threshold")
    parser.add_argument("--tsize", default=None, type=int, help="test img size")
    parser.add_argument("--fp16", dest="fp16", default=False, action="store_true",
                        help="Adopting mix precision evaluating.")
    parser.add_argument("--fuse", dest="fuse", default=False, action="store_true",
                        help="Fuse conv and bn for testing.")
    parser.add_argument("--trt", dest="trt", default=False, action="store_true",
                        help="Using TensorRT model for testing.")
    return parser
def get_image_list(path):
    """Recursively collect paths of files under *path* whose extension is in IMAGE_EXT."""
    return [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(path)
        for name in names
        if os.path.splitext(name)[1] in IMAGE_EXT
    ]
class Predictor(object):
    """Thin wrapper around a YOLOX model for single-image inference.

    Handles preprocessing (resize + normalize), the forward pass,
    NMS post-processing, and drawing of the resulting detections.
    """
    def __init__(self, model, exp, cls_names=COCO_CLASSES, trt_file=None, decoder=None, device="cpu"):
        self.model = model
        self.cls_names = cls_names
        self.decoder = decoder
        # test-time thresholds and input size come from the experiment description
        self.num_classes = exp.num_classes
        self.confthre = exp.test_conf
        self.nmsthre = exp.nmsthre
        self.test_size = exp.test_size
        self.device = device
        if trt_file is not None:
            from torch2trt import TRTModule
            model_trt = TRTModule()
            model_trt.load_state_dict(torch.load(trt_file))
            x = torch.ones(1, 3, exp.test_size[0], exp.test_size[1]).cuda()
            # dummy forward on the original model before swapping in the TRT
            # engine (presumably to initialize CUDA state — TODO confirm)
            self.model(x)
            self.model = model_trt
        # ImageNet statistics consumed by preproc for input normalization
        self.rgb_means = (0.485, 0.456, 0.406)
        self.std = (0.229, 0.224, 0.225)

    def inference(self, img):
        """Run detection on one image (file path or BGR array).

        Returns (outputs, img_info): outputs is the post-processed detection
        list; img_info carries the raw image and the resize ratio needed to
        map boxes back to original coordinates.
        """
        img_info = {'id': 0}
        if isinstance(img, str):
            # a path was given: remember the file name and load the image
            img_info['file_name'] = os.path.basename(img)
            img = cv2.imread(img)
        else:
            img_info['file_name'] = None
        height, width = img.shape[:2]
        img_info['height'] = height
        img_info['width'] = width
        img_info['raw_img'] = img
        img, ratio = preproc(img, self.test_size, self.rgb_means, self.std)
        img_info['ratio'] = ratio
        img = torch.from_numpy(img).unsqueeze(0)  # add batch dimension
        if self.device == "gpu":
            img = img.cuda()
        with torch.no_grad():
            t0 = time.time()
            outputs = self.model(img)
            # TRT path: raw head outputs still need decoding
            if self.decoder is not None:
                outputs = self.decoder(outputs, dtype=outputs.type())
            outputs = postprocess(
                outputs, self.num_classes, self.confthre, self.nmsthre
            )
            logger.info('Infer time: {:.4f}s'.format(time.time()-t0))
        return outputs, img_info

    def visual(self, output, img_info, cls_conf=0.35):
        """Draw detections with score above cls_conf onto the raw image."""
        ratio = img_info['ratio']
        img = img_info['raw_img']
        if output is None:
            # nothing survived post-processing
            return img
        output = output.cpu()
        bboxes = output[:, 0:4]
        # preprocessing: resize
        # NOTE(review): bboxes is a view of `output`, so this also rescales
        # `output` in place — callers reusing `output` should be aware.
        bboxes /= ratio
        cls = output[:, 6]
        scores = output[:, 4] * output[:, 5]
        vis_res = vis(img, bboxes, scores, cls, cls_conf, self.cls_names)
        return vis_res
def image_demo(predictor, vis_folder, path, current_time, save_result):
    """Run detection on a single image file or every image under a directory."""
    image_paths = get_image_list(path) if os.path.isdir(path) else [path]
    image_paths.sort()
    for image_path in image_paths:
        outputs, img_info = predictor.inference(image_path)
        result_image = predictor.visual(outputs[0], img_info)
        if save_result:
            stamp = time.strftime("%Y_%m_%d_%H_%M_%S", current_time)
            save_folder = os.path.join(vis_folder, stamp)
            os.makedirs(save_folder, exist_ok=True)
            save_file_name = os.path.join(save_folder, os.path.basename(image_path))
            logger.info("Saving detection result in {}".format(save_file_name))
            cv2.imwrite(save_file_name, result_image)
        # Esc / q / Q aborts the loop.
        key = cv2.waitKey(0)
        if key in (27, ord('q'), ord('Q')):
            break
def imageflow_demo(predictor, vis_folder, current_time, args):
    """Run detection on a video file or webcam stream and write an annotated copy."""
    source = args.path if args.demo == 'video' else args.camid
    cap = cv2.VideoCapture(source)
    frame_w = cap.get(cv2.CAP_PROP_FRAME_WIDTH)  # float
    frame_h = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)  # float
    fps = cap.get(cv2.CAP_PROP_FPS)
    stamp = time.strftime("%Y_%m_%d_%H_%M_%S", current_time)
    save_folder = os.path.join(vis_folder, stamp)
    os.makedirs(save_folder, exist_ok=True)
    if args.demo == "video":
        save_path = os.path.join(save_folder, args.path.split('/')[-1])
    else:
        save_path = os.path.join(save_folder, 'camera.mp4')
    logger.info(f'video save_path is {save_path}')
    vid_writer = cv2.VideoWriter(
        save_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (int(frame_w), int(frame_h))
    )
    while True:
        ret_val, frame = cap.read()
        if not ret_val:
            # stream exhausted (or read failure) — stop.
            break
        outputs, img_info = predictor.inference(frame)
        result_frame = predictor.visual(outputs[0], img_info)
        if args.save_result:
            vid_writer.write(result_frame)
        # Esc / q / Q aborts the loop.
        key = cv2.waitKey(1)
        if key in (27, ord('q'), ord('Q')):
            break
def main(exp, args):
    """Demo entry point: build the model, load weights, then dispatch to image
    or video/webcam inference.

    Args:
        exp: experiment description object (model factory + test settings).
        args: parsed command-line arguments from ``make_parser``.

    Fix: ``vis_folder`` is now always defined. Previously it was only assigned
    when ``--save_result`` was set, but it is unconditionally passed to
    ``image_demo`` / ``imageflow_demo``, causing a NameError otherwise.
    """
    if not args.experiment_name:
        args.experiment_name = exp.exp_name

    file_name = os.path.join(exp.output_dir, args.experiment_name)
    os.makedirs(file_name, exist_ok=True)

    # Always define vis_folder; the demo helpers only use it when results are
    # actually being saved.
    vis_folder = None
    if args.save_result:
        vis_folder = os.path.join(file_name, 'vis_res')
        os.makedirs(vis_folder, exist_ok=True)

    if args.trt:
        # TensorRT engines require the GPU path.
        args.device = "gpu"

    logger.info("Args: {}".format(args))

    # Command-line overrides of the experiment's test-time settings.
    if args.conf is not None:
        exp.test_conf = args.conf
    if args.nms is not None:
        exp.nmsthre = args.nms
    if args.tsize is not None:
        exp.test_size = (args.tsize, args.tsize)

    model = exp.get_model()
    logger.info("Model Summary: {}".format(get_model_info(model, exp.test_size)))
    if args.device == "gpu":
        model.cuda()
    model.eval()

    if not args.trt:
        if args.ckpt is None:
            ckpt_file = os.path.join(file_name, "best_ckpt.pth.tar")
        else:
            ckpt_file = args.ckpt
        logger.info("loading checkpoint")
        ckpt = torch.load(ckpt_file, map_location="cpu")
        # load the model state dict
        model.load_state_dict(ckpt["model"])
        logger.info("loaded checkpoint done.")

    if args.fuse:
        logger.info("\tFusing model...")
        model = fuse_model(model)

    if args.trt:
        assert (not args.fuse),\
            "TensorRT model is not support model fusing!"
        trt_file = os.path.join(file_name, "model_trt.pth")
        assert os.path.exists(trt_file), (
            "TensorRT model is not found!\n Run python3 tools/trt.py first!"
        )
        # Raw head outputs come out of the engine; decode them outside.
        model.head.decode_in_inference = False
        decoder = model.head.decode_outputs
        logger.info("Using TensorRT to inference")
    else:
        trt_file = None
        decoder = None

    predictor = Predictor(model, exp, COCO_CLASSES, trt_file, decoder, args.device)
    current_time = time.localtime()
    if args.demo == 'image':
        image_demo(predictor, vis_folder, args.path, current_time, args.save_result)
    elif args.demo == 'video' or args.demo == 'webcam':
        imageflow_demo(predictor, vis_folder, current_time, args)


if __name__ == "__main__":
    args = make_parser().parse_args()
    exp = get_exp(args.exp_file, args.name)

    main(exp, args)
================================================
FILE: detector/YOLOX/tools/eval.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
import argparse
import os
import random
import warnings
from loguru import logger
import torch
import torch.backends.cudnn as cudnn
from torch.nn.parallel import DistributedDataParallel as DDP
from yolox.core import launch
from yolox.exp import get_exp
from yolox.utils import configure_nccl, fuse_model, get_local_rank, get_model_info, setup_logger
def make_parser():
    """Build the command-line argument parser for YOLOX evaluation."""
    p = argparse.ArgumentParser("YOLOX Eval")
    p.add_argument("-expn", "--experiment-name", type=str, default=None)
    p.add_argument("-n", "--name", type=str, default=None, help="model name")
    # distributed
    p.add_argument("--dist-backend", default="nccl", type=str, help="distributed backend")
    p.add_argument("--dist-url", default=None, type=str, help="url used to set up distributed training")
    p.add_argument("-b", "--batch-size", type=int, default=64, help="batch size")
    p.add_argument("-d", "--devices", default=None, type=int, help="device for training")
    p.add_argument("--local_rank", default=0, type=int, help="local rank for dist training")
    p.add_argument("--num_machine", default=1, type=int, help="num of node for training")
    p.add_argument("--machine_rank", default=0, type=int, help="node rank for multi-node training")
    p.add_argument("-f", "--exp_file", default=None, type=str, help="pls input your expriment description file")
    p.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt for eval")
    p.add_argument("--conf", default=None, type=float, help="test conf")
    p.add_argument("--nms", default=None, type=float, help="test nms threshold")
    p.add_argument("--tsize", default=None, type=int, help="test img size")
    p.add_argument("--seed", default=None, type=int, help="eval seed")
    # boolean switches (all default off)
    p.add_argument("--fp16", dest="fp16", default=False, action="store_true", help="Adopting mix precision evaluating.")
    p.add_argument("--fuse", dest="fuse", default=False, action="store_true", help="Fuse conv and bn for testing.")
    p.add_argument("--trt", dest="trt", default=False, action="store_true", help="Using TensorRT model for testing.")
    p.add_argument("--test", dest="test", default=False, action="store_true", help="Evaluating on test-dev set.")
    p.add_argument("--speed", dest="speed", default=False, action="store_true", help="speed test only.")
    # everything after recognized options is forwarded to exp.merge()
    p.add_argument("opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER)
    return p
@logger.catch
def main(exp, num_gpu, args):
    """Evaluation worker: executed once per GPU process (spawned by ``launch``).

    Args:
        exp: experiment description object.
        num_gpu: number of GPUs participating in evaluation.
        args: parsed command-line arguments from ``make_parser``.
    """
    if not args.experiment_name:
        args.experiment_name = exp.exp_name

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn(
            "You have chosen to seed testing. This will turn on the CUDNN deterministic setting, "
        )

    is_distributed = num_gpu > 1

    # set environment variables for distributed training
    configure_nccl()
    cudnn.benchmark = True

    # rank = args.local_rank
    rank = get_local_rank()

    if rank == 0:
        # NOTE(review): looks like cleanup of a stale file left by a previous
        # run — confirm what writes "<experiment_name>ip_add.txt".
        if os.path.exists("./" + args.experiment_name + "ip_add.txt"):
            os.remove("./" + args.experiment_name + "ip_add.txt")

    file_name = os.path.join(exp.output_dir, args.experiment_name)

    if rank == 0:
        os.makedirs(file_name, exist_ok=True)

    setup_logger(
        file_name, distributed_rank=rank, filename="val_log.txt", mode="a"
    )
    logger.info("Args: {}".format(args))

    # Command-line overrides of the experiment's test-time settings.
    if args.conf is not None:
        exp.test_conf = args.conf
    if args.nms is not None:
        exp.nmsthre = args.nms
    if args.tsize is not None:
        exp.test_size = (args.tsize, args.tsize)

    model = exp.get_model()
    logger.info("Model Summary: {}".format(get_model_info(model, exp.test_size)))
    logger.info("Model Structure:\n{}".format(str(model)))

    evaluator = exp.get_evaluator(args.batch_size, is_distributed, args.test)

    torch.cuda.set_device(rank)
    model.cuda(rank)
    model.eval()

    if not args.speed and not args.trt:
        if args.ckpt is None:
            ckpt_file = os.path.join(file_name, "best_ckpt.pth.tar")
        else:
            ckpt_file = args.ckpt
        logger.info("loading checkpoint")
        loc = "cuda:{}".format(rank)
        ckpt = torch.load(ckpt_file, map_location=loc)
        # load the model state dict
        model.load_state_dict(ckpt["model"])
        logger.info("loaded checkpoint done.")

    if is_distributed:
        model = DDP(model, device_ids=[rank])

    if args.fuse:
        logger.info("\tFusing model...")
        model = fuse_model(model)

    if args.trt:
        # TRT evaluation: single process, batch size 1, no layer fusing.
        assert (not args.fuse and not is_distributed and args.batch_size == 1),\
            "TensorRT model is not support model fusing and distributed inferencing!"
        trt_file = os.path.join(file_name, "model_trt.pth")
        assert os.path.exists(trt_file), "TensorRT model is not found!\n Run tools/trt.py first!"
        model.head.decode_in_inference = False
        decoder = model.head.decode_outputs
    else:
        trt_file = None
        decoder = None

    # start evaluate
    *_, summary = evaluator.evaluate(
        model, is_distributed, args.fp16, trt_file, decoder, exp.test_size
    )
    logger.info("\n" + summary)


if __name__ == "__main__":
    args = make_parser().parse_args()
    exp = get_exp(args.exp_file, args.name)
    exp.merge(args.opts)

    num_gpu = torch.cuda.device_count() if args.devices is None else args.devices
    assert num_gpu <= torch.cuda.device_count()

    dist_url = "auto" if args.dist_url is None else args.dist_url
    launch(
        main, num_gpu, args.num_machine, backend=args.dist_backend,
        dist_url=dist_url, args=(exp, num_gpu, args)
    )
================================================
FILE: detector/YOLOX/tools/export_onnx.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
import argparse
import os
from loguru import logger
import torch
from torch import nn
from yolox.exp import get_exp
from yolox.models.network_blocks import SiLU
from yolox.utils import replace_module
def make_parser():
    """Build the command-line argument parser for ONNX export."""
    p = argparse.ArgumentParser("YOLOX onnx deploy")
    p.add_argument("--output-name", type=str, default="yolox.onnx", help="output name of models")
    p.add_argument("--input", default="images", type=str, help="input name of onnx model")
    p.add_argument("--output", default="output", type=str, help="output name of onnx model")
    p.add_argument("-o", "--opset", default=11, type=int, help="onnx opset version")
    p.add_argument("--no-onnxsim", action="store_true", help="use onnxsim or not")
    p.add_argument("-f", "--exp_file", default=None, type=str, help="expriment description file")
    p.add_argument("-expn", "--experiment-name", type=str, default=None)
    p.add_argument("-n", "--name", type=str, default=None, help="model name")
    p.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt path")
    # everything after recognized options is forwarded to exp.merge()
    p.add_argument("opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER)
    return p
@logger.catch
def main():
    """Export a trained YOLOX checkpoint to ONNX, optionally simplifying it
    with onnxsim afterwards."""
    args = make_parser().parse_args()
    logger.info("args value: {}".format(args))
    exp = get_exp(args.exp_file, args.name)
    exp.merge(args.opts)

    if not args.experiment_name:
        args.experiment_name = exp.exp_name

    model = exp.get_model()
    if args.ckpt is None:
        file_name = os.path.join(exp.output_dir, args.experiment_name)
        ckpt_file = os.path.join(file_name, "best_ckpt.pth.tar")
    else:
        ckpt_file = args.ckpt

    ckpt = torch.load(ckpt_file, map_location="cpu")
    # load the model state dict
    model.eval()
    if "model" in ckpt:
        ckpt = ckpt["model"]
    model.load_state_dict(ckpt)
    # Swap nn.SiLU for the project's SiLU implementation (presumably for ONNX
    # opset compatibility — TODO confirm).
    model = replace_module(model, nn.SiLU, SiLU)
    # Export raw head outputs; decoding is done outside the ONNX graph.
    model.head.decode_in_inference = False
    logger.info("loaded checkpoint done.")
    dummy_input = torch.randn(1, 3, exp.test_size[0], exp.test_size[1])
    # NOTE(review): torch.onnx._export is a private API; the public
    # torch.onnx.export should work identically here — consider switching.
    torch.onnx._export(
        model,
        dummy_input,
        args.output_name,
        input_names=[args.input],
        output_names=[args.output],
        opset_version=args.opset,
    )
    logger.info("generate onnx named {}".format(args.output_name))

    if not args.no_onnxsim:
        # use onnxsimplify to reduce reduent model.
        os.system("python3 -m onnxsim {} {}".format(args.output_name, args.output_name))
        logger.info("generate simplify onnx named {}".format(args.output_name))


if __name__ == "__main__":
    main()
================================================
FILE: detector/YOLOX/tools/train.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
import argparse
import random
import warnings
from loguru import logger
import torch
import torch.backends.cudnn as cudnn
from yolox.core import Trainer, launch
from yolox.exp import get_exp
from yolox.utils import configure_nccl
def make_parser():
    """Build the command-line argument parser for YOLOX training."""
    p = argparse.ArgumentParser("YOLOX train parser")
    p.add_argument("-expn", "--experiment-name", type=str, default=None)
    p.add_argument("-n", "--name", type=str, default=None, help="model name")
    # distributed
    p.add_argument("--dist-backend", default="nccl", type=str, help="distributed backend")
    p.add_argument("--dist-url", default=None, type=str, help="url used to set up distributed training")
    p.add_argument("-b", "--batch-size", type=int, default=64, help="batch size")
    p.add_argument("-d", "--devices", default=None, type=int, help="device for training")
    p.add_argument("--local_rank", default=0, type=int, help="local rank for dist training")
    p.add_argument("-f", "--exp_file", default=None, type=str, help="plz input your expriment description file")
    p.add_argument("--resume", default=False, action="store_true", help="resume training")
    p.add_argument("-c", "--ckpt", default=None, type=str, help="checkpoint file")
    p.add_argument("-e", "--start_epoch", default=None, type=int, help="resume training start epoch")
    p.add_argument("--num_machine", default=1, type=int, help="num of node for training")
    p.add_argument("--machine_rank", default=0, type=int, help="node rank for multi-node training")
    # NOTE: default is True, so --fp16 is effectively always on; the flag
    # cannot turn mixed precision off.
    p.add_argument("--fp16", dest="fp16", default=True, action="store_true", help="Adopting mix precision training.")
    p.add_argument("-o", "--occumpy", dest="occumpy", default=False, action="store_true", help="occumpy GPU memory first for training.")
    # everything after recognized options is forwarded to exp.merge()
    p.add_argument("opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER)
    return p
@logger.catch
def main(exp, args):
    """Training worker: executed once per GPU process (spawned by ``launch``).

    Seeds RNGs when the experiment requests it, configures NCCL/cudnn, then
    hands control to ``Trainer``.
    """
    if not args.experiment_name:
        args.experiment_name = exp.exp_name

    if exp.seed is not None:
        random.seed(exp.seed)
        torch.manual_seed(exp.seed)
        cudnn.deterministic = True
        warnings.warn(
            "You have chosen to seed training. This will turn on the CUDNN deterministic setting, "
            "which can slow down your training considerably! You may see unexpected behavior "
            "when restarting from checkpoints."
        )

    # set environment variables for distributed training
    configure_nccl()
    cudnn.benchmark = True

    trainer = Trainer(exp, args)
    trainer.train()


if __name__ == "__main__":
    args = make_parser().parse_args()
    exp = get_exp(args.exp_file, args.name)
    exp.merge(args.opts)

    num_gpu = torch.cuda.device_count() if args.devices is None else args.devices
    assert num_gpu <= torch.cuda.device_count()

    dist_url = "auto" if args.dist_url is None else args.dist_url
    launch(
        main, num_gpu, args.num_machine, backend=args.dist_backend,
        dist_url=dist_url, args=(exp, args)
    )
================================================
FILE: detector/YOLOX/tools/trt.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
import argparse
import os
import shutil
from loguru import logger
import tensorrt as trt
import torch
from torch2trt import torch2trt
from yolox.exp import get_exp
def make_parser():
    """Build the command-line argument parser for TensorRT conversion."""
    p = argparse.ArgumentParser("YOLOX ncnn deploy")
    p.add_argument("-expn", "--experiment-name", type=str, default=None)
    p.add_argument("-n", "--name", type=str, default=None, help="model name")
    p.add_argument("-f", "--exp_file", default=None, type=str, help="pls input your expriment description file")
    p.add_argument("-c", "--ckpt", default=None, type=str, help="ckpt path")
    return p
@logger.catch
def main():
    """Convert a trained YOLOX checkpoint into a TensorRT module via torch2trt,
    saving both the torch2trt state dict and a serialized engine file."""
    args = make_parser().parse_args()
    exp = get_exp(args.exp_file, args.name)
    if not args.experiment_name:
        args.experiment_name = exp.exp_name

    model = exp.get_model()
    file_name = os.path.join(exp.output_dir, args.experiment_name)
    os.makedirs(file_name, exist_ok=True)
    if args.ckpt is None:
        ckpt_file = os.path.join(file_name, "best_ckpt.pth.tar")
    else:
        ckpt_file = args.ckpt

    ckpt = torch.load(ckpt_file, map_location="cpu")
    # load the model state dict
    model.load_state_dict(ckpt["model"])
    logger.info("loaded checkpoint done.")
    model.eval()
    model.cuda()
    # Keep the decode step out of the converted graph.
    model.head.decode_in_inference = False
    # Dummy input fixing the engine's input shape to the experiment test size.
    x = torch.ones(1, 3, exp.test_size[0], exp.test_size[1]).cuda()
    model_trt = torch2trt(
        model,
        [x],
        fp16_mode=True,
        log_level=trt.Logger.INFO,
        max_workspace_size=(1 << 32),
    )
    torch.save(model_trt.state_dict(), os.path.join(file_name, 'model_trt.pth'))
    logger.info("Converted TensorRT model done.")
    engine_file = os.path.join(file_name, 'model_trt.engine')
    engine_file_demo = os.path.join('demo', 'TensorRT', 'cpp', 'model_trt.engine')
    with open(engine_file, 'wb') as f:
        f.write(model_trt.engine.serialize())

    # Also drop a copy where the C++ demo expects it.
    shutil.copyfile(engine_file, engine_file_demo)

    logger.info("Converted TensorRT model engine file is saved for C++ inference.")


if __name__ == "__main__":
    main()
================================================
FILE: detector/YOLOX/yolox/__init__.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
from .utils import configure_module
configure_module()
__version__ = "0.1.0"
================================================
FILE: detector/YOLOX/yolox/core/__init__.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
from .launch import launch
from .trainer import Trainer
================================================
FILE: detector/YOLOX/yolox/core/launch.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Code are based on
# https://github.com/facebookresearch/detectron2/blob/master/detectron2/engine/launch.py
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) Megvii, Inc. and its affiliates.
from loguru import logger
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import yolox.utils.dist as comm
__all__ = ["launch"]
def _find_free_port():
"""
Find an available port of current machine / node.
"""
import socket
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
# Binding to port 0 will cause the OS to find an available port for us
sock.bind(("", 0))
port = sock.getsockname()[1]
sock.close()
# NOTE: there is still a chance the port could be taken by other processes.
return port
def launch(
    main_func, num_gpus_per_machine, num_machines=1, machine_rank=0,
    backend="nccl", dist_url=None, args=()
):
    """
    Spawn one worker process per local GPU (or run in-process for a single
    process) and call ``main_func(*args)`` in each.

    Args:
        main_func: a function that will be called by `main_func(*args)`
        num_machines (int): the total number of machines
        machine_rank (int): the rank of this machine (one per machine)
        dist_url (str): url to connect to for distributed training, including protocol
                       e.g. "tcp://127.0.0.1:8686".
                       Can be set to auto to automatically select a free port on localhost
        args (tuple): arguments passed to main_func
    """
    world_size = num_machines * num_gpus_per_machine
    if world_size > 1:
        # https://github.com/pytorch/pytorch/pull/14391
        # TODO prctl in spawned processes
        if dist_url == "auto":
            assert num_machines == 1, "dist_url=auto cannot work with distributed training."
            port = _find_free_port()
            dist_url = f"tcp://127.0.0.1:{port}"

        # One process per local GPU; each worker receives its local rank as
        # the implicit first argument from mp.spawn.
        mp.spawn(
            _distributed_worker,
            nprocs=num_gpus_per_machine,
            args=(
                main_func, world_size, num_gpus_per_machine,
                machine_rank, backend, dist_url, args
            ),
            daemon=False,
        )
    else:
        # Single-process path: no distributed setup needed.
        main_func(*args)
def _distributed_worker(
    local_rank, main_func, world_size, num_gpus_per_machine,
    machine_rank, backend, dist_url, args
):
    """Per-process entry point used by ``mp.spawn``.

    Initializes the global process group, sets the CUDA device, builds the
    per-machine local process group, then runs ``main_func(*args)``.
    """
    assert torch.cuda.is_available(), "cuda is not available. Please check your installation."
    global_rank = machine_rank * num_gpus_per_machine + local_rank
    logger.info("Rank {} initialization finished.".format(global_rank))
    try:
        dist.init_process_group(
            backend=backend,
            init_method=dist_url,
            world_size=world_size,
            rank=global_rank,
        )
    except Exception:
        # Log the rendezvous URL to make connection failures diagnosable.
        logger.error("Process group URL: {}".format(dist_url))
        raise

    # synchronize is needed here to prevent a possible timeout after calling init_process_group
    # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
    comm.synchronize()

    assert num_gpus_per_machine <= torch.cuda.device_count()
    torch.cuda.set_device(local_rank)

    # Setup the local process group (which contains ranks within the same machine)
    assert comm._LOCAL_PROCESS_GROUP is None
    num_machines = world_size // num_gpus_per_machine
    for i in range(num_machines):
        ranks_on_i = list(range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine))
        pg = dist.new_group(ranks_on_i)
        if i == machine_rank:
            comm._LOCAL_PROCESS_GROUP = pg

    main_func(*args)
================================================
FILE: detector/YOLOX/yolox/core/trainer.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
import datetime
import os
import time
from loguru import logger
import apex
import torch
from apex import amp
from torch.utils.tensorboard import SummaryWriter
from yolox.data import DataPrefetcher
from yolox.utils import (
MeterBuffer,
ModelEMA,
all_reduce_norm,
get_local_rank,
get_model_info,
get_rank,
get_world_size,
gpu_mem_usage,
load_ckpt,
occumpy_mem,
save_checkpoint,
setup_logger,
synchronize
)
class Trainer:
    """Drives the YOLOX training loop.

    Responsibilities visible in this class: epoch/iteration scheduling, apex
    amp mixed precision, optional model EMA, apex distributed data parallel,
    checkpointing, periodic evaluation, and TensorBoard/loguru logging.
    """

    def __init__(self, exp, args):
        # init function only defines some basic attr, other attrs like model, optimizer are built in
        # before_train methods.
        self.exp = exp
        self.args = args

        # training related attr
        self.max_epoch = exp.max_epoch
        self.amp_training = args.fp16
        self.is_distributed = get_world_size() > 1
        self.rank = get_rank()
        self.local_rank = get_local_rank()
        self.device = "cuda:{}".format(self.local_rank)
        self.use_model_ema = exp.ema

        # data/dataloader related attr
        self.data_type = torch.float16 if args.fp16 else torch.float32
        self.input_size = exp.input_size
        self.best_ap = 0

        # metric record
        self.meter = MeterBuffer(window_size=exp.print_interval)
        self.file_name = os.path.join(exp.output_dir, args.experiment_name)
        # NOTE(review): looks like cleanup of a stale file left by a previous
        # run — confirm what writes "<experiment_name>ip_add.txt".
        if self.rank == 0 and os.path.exists("./" + args.experiment_name + "ip_add.txt"):
            os.remove("./" + args.experiment_name + "ip_add.txt")
        if self.rank == 0:
            os.makedirs(self.file_name, exist_ok=True)

        setup_logger(self.file_name, distributed_rank=self.rank, filename="train_log.txt", mode="a")

    def train(self):
        """Top-level entry: setup, epoch loop, and teardown (always runs)."""
        self.before_train()
        try:
            self.train_in_epoch()
        except Exception:
            raise
        finally:
            self.after_train()

    def train_in_epoch(self):
        """Iterate epochs from ``start_epoch`` (set in resume_train) to ``max_epoch``."""
        for self.epoch in range(self.start_epoch, self.max_epoch):
            self.before_epoch()
            self.train_in_iter()
            self.after_epoch()

    def train_in_iter(self):
        """Run all iterations of the current epoch."""
        for self.iter in range(self.max_iter):
            self.before_iter()
            self.train_one_iter()
            self.after_iter()

    def train_one_iter(self):
        """Run a single optimization step on the next prefetched batch."""
        iter_start_time = time.time()

        inps, targets = self.prefetcher.next()
        inps = inps.to(self.data_type)
        targets = targets.to(self.data_type)
        targets.requires_grad = False
        data_end_time = time.time()

        outputs = self.model(inps, targets)
        loss = outputs["total_loss"]

        self.optimizer.zero_grad()
        if self.amp_training:
            # apex amp loss scaling for mixed precision
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        self.optimizer.step()

        if self.use_model_ema:
            self.ema_model.update(self.model)

        # scheduler gives the lr for the *next* iteration index
        lr = self.lr_scheduler.update_lr(self.progress_in_iter + 1)
        for param_group in self.optimizer.param_groups:
            param_group["lr"] = lr

        iter_end_time = time.time()
        self.meter.update(
            iter_time=iter_end_time - iter_start_time,
            data_time=data_end_time - iter_start_time,
            lr=lr,
            **outputs,
        )

    def before_train(self):
        """Build model, optimizer, dataloader, scheduler, EMA, and evaluator."""
        logger.info("args: {}".format(self.args))
        logger.info("exp value:\n{}".format(self.exp))

        # model related init
        torch.cuda.set_device(self.local_rank)
        model = self.exp.get_model()
        logger.info("Model Summary: {}".format(get_model_info(model, self.exp.test_size)))
        model.to(self.device)

        # solver related init
        self.optimizer = self.exp.get_optimizer(self.args.batch_size)
        if self.amp_training:
            model, optimizer = amp.initialize(model, self.optimizer, opt_level="O1")

        # value of epoch will be set in `resume_train`
        model = self.resume_train(model)

        # data related init
        self.no_aug = self.start_epoch >= self.max_epoch - self.exp.no_aug_epochs
        self.train_loader = self.exp.get_data_loader(
            batch_size=self.args.batch_size,
            is_distributed=self.is_distributed,
            no_aug=self.no_aug
        )
        logger.info("init prefetcher, this might take one minute or less...")
        self.prefetcher = DataPrefetcher(self.train_loader)
        # max_iter means iters per epoch
        self.max_iter = len(self.train_loader)

        self.lr_scheduler = self.exp.get_lr_scheduler(
            self.exp.basic_lr_per_img * self.args.batch_size, self.max_iter
        )
        if self.args.occumpy:
            occumpy_mem(self.local_rank)

        if self.is_distributed:
            model = apex.parallel.DistributedDataParallel(model)
            # from torch.nn.parallel import DistributedDataParallel as DDP
            # model = DDP(model, device_ids=[self.local_rank], broadcast_buffers=False)

        if self.use_model_ema:
            self.ema_model = ModelEMA(model, 0.9998)
            # keep EMA update count consistent when resuming
            self.ema_model.updates = self.max_iter * self.start_epoch

        self.model = model
        self.model.train()

        self.evaluator = self.exp.get_evaluator(
            batch_size=self.args.batch_size, is_distributed=self.is_distributed
        )
        # Tensorboard logger
        if self.rank == 0:
            self.tblogger = SummaryWriter(self.file_name)

        logger.info("Training start...")
        logger.info("\n{}".format(model))

    def after_train(self):
        """Final log line with the best AP seen during training."""
        logger.info(
            "Training of experiment is done and the best AP is {:.2f}".format(self.best_ap * 100)
        )

    def before_epoch(self):
        """Per-epoch setup; near the end of training, disable mosaic aug and
        enable the additional L1 loss, and evaluate every epoch."""
        logger.info("---> start train epoch{}".format(self.epoch + 1))

        if self.epoch + 1 == self.max_epoch - self.exp.no_aug_epochs or self.no_aug:
            logger.info("--->No mosaic aug now!")
            self.train_loader.close_mosaic()
            logger.info("--->Add additional L1 loss now!")
            if self.is_distributed:
                self.model.module.head.use_l1 = True
            else:
                self.model.head.use_l1 = True
            self.exp.eval_interval = 1
            if not self.no_aug:
                self.save_ckpt(ckpt_name="last_mosaic_epoch")

    def after_epoch(self):
        """Update EMA attrs, save the latest checkpoint, evaluate on schedule."""
        if self.use_model_ema:
            self.ema_model.update_attr(self.model)

        self.save_ckpt(ckpt_name="latest")

        if (self.epoch + 1) % self.exp.eval_interval == 0:
            all_reduce_norm(self.model)
            self.evaluate_and_save_model()

    def before_iter(self):
        pass

    def after_iter(self):
        """
        `after_iter` contains two parts of logic:
            * log information
            * reset setting of resize
        """
        # log needed information
        if (self.iter + 1) % self.exp.print_interval == 0:
            # TODO check ETA logic
            left_iters = self.max_iter * self.max_epoch - (self.progress_in_iter + 1)
            eta_seconds = self.meter["iter_time"].global_avg * left_iters
            eta_str = "ETA: {}".format(datetime.timedelta(seconds=int(eta_seconds)))

            progress_str = "epoch: {}/{}, iter: {}/{}".format(
                self.epoch + 1, self.max_epoch, self.iter + 1, self.max_iter
            )
            loss_meter = self.meter.get_filtered_meter("loss")
            loss_str = ", ".join(["{}: {:.1f}".format(k, v.latest) for k, v in loss_meter.items()])

            time_meter = self.meter.get_filtered_meter("time")
            time_str = ", ".join(["{}: {:.3f}s".format(k, v.avg) for k, v in time_meter.items()])

            logger.info(
                "{}, mem: {:.0f}Mb, {}, {}, lr: {:.3e}".format(
                    progress_str,
                    gpu_mem_usage(),
                    time_str,
                    loss_str,
                    self.meter["lr"].latest,
                )
                + (", size: {:d}, {}".format(self.input_size[0], eta_str))
            )
            self.meter.clear_meters()

        # random resizing
        if self.exp.random_size is not None and (self.progress_in_iter + 1) % 10 == 0:
            self.input_size = self.exp.random_resize(
                self.train_loader, self.epoch, self.rank, self.is_distributed
            )

    @property
    def progress_in_iter(self):
        # global iteration index across all epochs
        return self.epoch * self.max_iter + self.iter

    def resume_train(self, model):
        """Load weights for resuming (model+optimizer+amp state) or fine-tuning
        (model weights only); sets ``self.start_epoch`` and returns the model."""
        if self.args.resume:
            logger.info("resume training")
            if self.args.ckpt is None:
                ckpt_file = os.path.join(self.file_name, "latest" + "_ckpt.pth.tar")
            else:
                ckpt_file = self.args.ckpt

            ckpt = torch.load(ckpt_file, map_location=self.device)
            # resume the model/optimizer state dict
            model.load_state_dict(ckpt["model"])
            self.optimizer.load_state_dict(ckpt["optimizer"])
            # resume the training states variables
            if self.amp_training and "amp" in ckpt:
                amp.load_state_dict(ckpt["amp"])
            start_epoch = (
                self.args.start_epoch - 1
                if self.args.start_epoch is not None
                else ckpt["start_epoch"]
            )
            self.start_epoch = start_epoch
            logger.info("loaded checkpoint '{}' (epoch {})".format(self.args.resume, self.start_epoch))  # noqa
        else:
            if self.args.ckpt is not None:
                logger.info("loading checkpoint for fine tuning")
                ckpt_file = self.args.ckpt
                ckpt = torch.load(ckpt_file, map_location=self.device)["model"]
                model = load_ckpt(model, ckpt)
            self.start_epoch = 0

        return model

    def evaluate_and_save_model(self):
        """Evaluate (EMA model when enabled), log metrics, and checkpoint;
        tracks the best AP50:95 seen so far."""
        evalmodel = self.ema_model.ema if self.use_model_ema else self.model
        ap50_95, ap50, summary = self.exp.eval(evalmodel, self.evaluator, self.is_distributed)
        self.model.train()
        if self.rank == 0:
            self.tblogger.add_scalar("val/COCOAP50", ap50, self.epoch + 1)
            self.tblogger.add_scalar("val/COCOAP50_95", ap50_95, self.epoch + 1)
            logger.info("\n" + summary)
        synchronize()

        self.save_ckpt("last_epoch", ap50_95 > self.best_ap)
        self.best_ap = max(self.best_ap, ap50_95)

    def save_ckpt(self, ckpt_name, update_best_ckpt=False):
        """Save a checkpoint (rank 0 only); uses the EMA weights when enabled."""
        if self.rank == 0:
            save_model = self.ema_model.ema if self.use_model_ema else self.model
            logger.info("Save weights to {}".format(self.file_name))
            ckpt_state = {
                "start_epoch": self.epoch + 1,
                "model": save_model.state_dict(),
                "optimizer": self.optimizer.state_dict(),
            }
            if self.amp_training:
                # save amp state according to
                # https://nvidia.github.io/apex/amp.html#checkpointing
                ckpt_state["amp"] = amp.state_dict()
            save_checkpoint(
                ckpt_state,
                update_best_ckpt,
                self.file_name,
                ckpt_name,
            )
================================================
FILE: detector/YOLOX/yolox/data/__init__.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
from .data_augment import TrainTransform, ValTransform
from .data_prefetcher import DataPrefetcher
from .dataloading import DataLoader, get_yolox_datadir
from .datasets import *
from .samplers import InfiniteSampler, YoloBatchSampler
================================================
FILE: detector/YOLOX/yolox/data/data_augment.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
"""
Data augmentation functionality. Passed as callable transformations to
Dataset classes.
The data augmentation procedures were interpreted from @weiliu89's SSD paper
http://arxiv.org/abs/1512.02325
"""
import math
import random
import cv2
import numpy as np
import torch
def augment_hsv(img, hgain=0.015, sgain=0.7, vgain=0.4):
    """Apply random hue/saturation/value jitter to *img* in place (BGR image)."""
    gains = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain] + 1  # random gains
    hue, sat, val = cv2.split(cv2.cvtColor(img, cv2.COLOR_BGR2HSV))
    dtype = img.dtype  # uint8

    # Build per-channel lookup tables so the jitter is one cv2.LUT pass each.
    table = np.arange(0, 256, dtype=np.int16)
    lut_hue = ((table * gains[0]) % 180).astype(dtype)  # OpenCV hue range is [0, 180)
    lut_sat = np.clip(table * gains[1], 0, 255).astype(dtype)
    lut_val = np.clip(table * gains[2], 0, 255).astype(dtype)

    img_hsv = cv2.merge(
        (cv2.LUT(hue, lut_hue), cv2.LUT(sat, lut_sat), cv2.LUT(val, lut_val))
    ).astype(dtype)
    cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR, dst=img)  # writes back into img; no return needed
def box_candidates(box1, box2, wh_thr=2, ar_thr=20, area_thr=0.2):
    # box1(4,n), box2(4,n)
    # Keep boxes that survive augmentation: box1 is the pre-augment box,
    # box2 the post-augment box. A candidate must be wider/taller than
    # wh_thr pixels, retain more than area_thr of its original area, and
    # have an aspect ratio below ar_thr.
    eps = 1e-16
    w1, h1 = box1[2] - box1[0], box1[3] - box1[1]
    w2, h2 = box2[2] - box2[0], box2[3] - box2[1]
    aspect = np.maximum(w2 / (h2 + eps), h2 / (w2 + eps))  # aspect ratio
    wide_enough = w2 > wh_thr
    tall_enough = h2 > wh_thr
    area_kept = w2 * h2 / (w1 * h1 + eps) > area_thr
    ratio_ok = aspect < ar_thr
    return wide_enough & tall_enough & area_kept & ratio_ok  # candidates
def random_perspective(
    img, targets=(), degrees=10, translate=0.1, scale=0.1, shear=10, perspective=0.0, border=(0, 0),
):
    """Apply a random affine/perspective warp to an image and its boxes.

    The transform is composed (right to left) of: centering, rotation+scale,
    shear, and translation. Boxes are warped by transforming their four
    corners and taking the axis-aligned envelope, then filtered with
    ``box_candidates``.

    NOTE(review): ``scale`` is indexed as ``scale[0], scale[1]`` below, so it
    must be a (min, max) tuple at call time; the scalar-looking default 0.1
    would raise TypeError — confirm callers always pass a tuple
    (MosaicDetection passes ``scale=(0.5, 1.5)``).

    Args:
        img: HxWxC image array.
        targets: (n, >=5) array whose first four columns are xyxy boxes.
        border: (dy, dx) added to each side; negative values crop (used by
            mosaic to cut the 2x canvas back down).

    Returns:
        (img, targets): the warped image and the surviving, warped targets.
    """
    # targets = [cls, xyxy]
    height = img.shape[0] + border[0] * 2  # shape(h,w,c)
    width = img.shape[1] + border[1] * 2
    # Center: move the image center to the origin before rotating/shearing.
    C = np.eye(3)
    C[0, 2] = -img.shape[1] / 2  # x translation (pixels)
    C[1, 2] = -img.shape[0] / 2  # y translation (pixels)
    # Rotation and Scale
    R = np.eye(3)
    a = random.uniform(-degrees, degrees)
    # a += random.choice([-180, -90, 0, 90])  # add 90deg rotations to small rotations
    s = random.uniform(scale[0], scale[1])
    # s = 2 ** random.uniform(-scale, scale)
    R[:2] = cv2.getRotationMatrix2D(angle=a, center=(0, 0), scale=s)
    # Shear
    S = np.eye(3)
    S[0, 1] = math.tan(random.uniform(-shear, shear) * math.pi / 180)  # x shear (deg)
    S[1, 0] = math.tan(random.uniform(-shear, shear) * math.pi / 180)  # y shear (deg)
    # Translation: up to +-translate of the output size around its center.
    T = np.eye(3)
    T[0, 2] = (random.uniform(0.5 - translate, 0.5 + translate) * width)  # x translation (pixels)
    T[1, 2] = (random.uniform(0.5 - translate, 0.5 + translate) * height)  # y translation (pixels)
    # Combined rotation matrix
    M = T @ S @ R @ C  # order of operations (right to left) is IMPORTANT
    ###########################
    # For Aug out of Mosaic
    # s = 1.
    # M = np.eye(3)
    ###########################
    # Skip the warp entirely when the transform is the identity and no border
    # adjustment is requested.
    if (border[0] != 0) or (border[1] != 0) or (M != np.eye(3)).any():  # image changed
        if perspective:
            img = cv2.warpPerspective(img, M, dsize=(width, height), borderValue=(114, 114, 114))
        else:  # affine
            img = cv2.warpAffine(img, M[:2], dsize=(width, height), borderValue=(114, 114, 114))
    # Transform label coordinates
    n = len(targets)
    if n:
        # warp points: all four corners of every box, in homogeneous coords
        xy = np.ones((n * 4, 3))
        xy[:, :2] = targets[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(n * 4, 2)  # x1y1, x2y2, x1y2, x2y1
        xy = xy @ M.T  # transform
        if perspective:
            xy = (xy[:, :2] / xy[:, 2:3]).reshape(n, 8)  # rescale
        else:  # affine
            xy = xy[:, :2].reshape(n, 8)
        # create new boxes: axis-aligned envelope of the warped corners
        x = xy[:, [0, 2, 4, 6]]
        y = xy[:, [1, 3, 5, 7]]
        xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T
        # clip boxes to the output image
        xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width)
        xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height)
        # filter candidates: drop boxes that degenerated under the warp
        i = box_candidates(box1=targets[:, :4].T * s, box2=xy.T)
        targets = targets[i]
        targets[:, :4] = xy[i]
    return img, targets
def _distort(image):
    """Random photometric distortion: brightness, contrast, hue, saturation.

    Each sub-transform fires independently with probability 0.5. Works on a
    copy of the BGR input and returns the distorted image.
    """
    def _scale_shift(arr, alpha=1, beta=0):
        # In-place: arr <- clamp(arr * alpha + beta, 0, 255).
        tmp = arr.astype(float) * alpha + beta
        np.clip(tmp, 0, 255, out=tmp)
        arr[:] = tmp

    image = image.copy()
    # brightness shift
    if random.randrange(2):
        _scale_shift(image, beta=random.uniform(-32, 32))
    # contrast scale
    if random.randrange(2):
        _scale_shift(image, alpha=random.uniform(0.5, 1.5))
    image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    # hue rotation (OpenCV 8-bit hue wraps at 180)
    if random.randrange(2):
        shifted = image[:, :, 0].astype(int) + random.randint(-18, 18)
        image[:, :, 0] = shifted % 180
    # saturation scale
    if random.randrange(2):
        _scale_shift(image[:, :, 1], alpha=random.uniform(0.5, 1.5))
    return cv2.cvtColor(image, cv2.COLOR_HSV2BGR)
def _mirror(image, boxes):
_, width, _ = image.shape
if random.randrange(2):
image = image[:, ::-1]
boxes = boxes.copy()
boxes[:, 0::2] = width - boxes[:, 2::-2]
return image, boxes
def preproc(image, input_size, mean, std, swap=(2, 0, 1)):
    """Letterbox-resize an image into ``input_size`` and normalize it.

    The image is scaled by the largest ratio that fits inside
    ``input_size`` (aspect ratio preserved), padded with the value 114,
    flipped BGR->RGB, scaled to [0, 1], optionally mean/std normalized,
    and transposed to the axis order given by ``swap`` (default CHW).

    Returns:
        (out, ratio): contiguous float32 array and the resize ratio used,
        so callers can map box coordinates into the letterboxed space.
    """
    if len(image.shape) == 3:
        canvas = np.ones((input_size[0], input_size[1], 3)) * 114.0
    else:
        canvas = np.ones(input_size) * 114.0
    src = np.array(image)
    ratio = min(input_size[0] / src.shape[0], input_size[1] / src.shape[1])
    new_h, new_w = int(src.shape[0] * ratio), int(src.shape[1] * ratio)
    scaled = cv2.resize(
        src, (new_w, new_h), interpolation=cv2.INTER_LINEAR
    ).astype(np.float32)
    # Paste the resized image into the top-left corner of the padded canvas.
    canvas[:new_h, :new_w] = scaled
    out = canvas.astype(np.float32)
    out = out[:, :, ::-1]  # BGR -> RGB
    out /= 255.0
    if mean is not None:
        out -= mean
    if std is not None:
        out /= std
    out = out.transpose(swap)
    out = np.ascontiguousarray(out, dtype=np.float32)
    return out, ratio
class TrainTransform:
    """Training-time preprocessing and augmentation pipeline.

    Applies photometric distortion and a random horizontal flip, letterboxes
    the image to ``input_dim`` via ``preproc``, converts boxes from xyxy to
    (cx, cy, w, h) in resized-image pixels, and pads the labels to a fixed
    ``max_labels`` rows of [class, cx, cy, w, h(, mixup_ratio)].
    """
    def __init__(self, p=0.5, rgb_means=None, std=None, max_labels=50):
        # NOTE(review): ``p`` is stored but never read in this class; the
        # flip probability is fixed at 0.5 inside ``_mirror`` — confirm.
        self.means = rgb_means
        self.std = std
        self.p = p
        self.max_labels = max_labels
    def __call__(self, image, targets, input_dim):
        """Augment one (image, targets) pair.

        Args:
            image: HxWx3 BGR array.
            targets: (n, 5) or (n, 6) array of [x1, y1, x2, y2, class(, ratio)];
                a 6th column marks mixup samples and carries the blend ratio.
            input_dim: (height, width) network input size.

        Returns:
            (image, padded_labels): float32 CHW image and a fixed-size
            (max_labels, 5 or 6) float32 label array (zero-padded).
        """
        boxes = targets[:, :4].copy()
        labels = targets[:, 4].copy()
        # A 6th target column signals mixup mode.
        if targets.shape[1] > 5:
            mixup = True
            ratios = targets[:, -1].copy()
            ratios_o = targets[:, -1].copy()
        else:
            mixup = False
            ratios = None
            ratios_o = None
        lshape = 6 if mixup else 5
        if len(boxes) == 0:
            # No objects: return the preprocessed image with all-zero labels.
            targets = np.zeros((self.max_labels, lshape), dtype=np.float32)
            image, r_o = preproc(image, input_dim, self.means, self.std)
            image = np.ascontiguousarray(image, dtype=np.float32)
            return image, targets
        # Keep untouched copies so we can fall back if augmentation removes
        # every box.
        image_o = image.copy()
        targets_o = targets.copy()
        height_o, width_o, _ = image_o.shape
        boxes_o = targets_o[:, :4]
        labels_o = targets_o[:, 4]
        # bbox_o: [xyxy] to [c_x,c_y,w,h]
        b_x_o = (boxes_o[:, 2] + boxes_o[:, 0]) * 0.5
        b_y_o = (boxes_o[:, 3] + boxes_o[:, 1]) * 0.5
        b_w_o = (boxes_o[:, 2] - boxes_o[:, 0]) * 1.0
        b_h_o = (boxes_o[:, 3] - boxes_o[:, 1]) * 1.0
        boxes_o[:, 0] = b_x_o
        boxes_o[:, 1] = b_y_o
        boxes_o[:, 2] = b_w_o
        boxes_o[:, 3] = b_h_o
        # Photometric jitter, then random horizontal flip (updates boxes).
        image_t = _distort(image)
        image_t, boxes = _mirror(image_t, boxes)
        height, width, _ = image_t.shape
        image_t, r_ = preproc(image_t, input_dim, self.means, self.std)
        boxes = boxes.copy()
        # boxes [xyxy] 2 [cx,cy,w,h]
        b_x = (boxes[:, 2] + boxes[:, 0]) * 0.5
        b_y = (boxes[:, 3] + boxes[:, 1]) * 0.5
        b_w = (boxes[:, 2] - boxes[:, 0]) * 1.0
        b_h = (boxes[:, 3] - boxes[:, 1]) * 1.0
        boxes[:, 0] = b_x
        boxes[:, 1] = b_y
        boxes[:, 2] = b_w
        boxes[:, 3] = b_h
        # Scale boxes into the letterboxed image's pixel space.
        boxes *= r_
        # Drop boxes whose shorter side shrank to <= 8 px after resize.
        mask_b = np.minimum(boxes[:, 2], boxes[:, 3]) > 8
        boxes_t = boxes[mask_b]
        labels_t = labels[mask_b].copy()
        if mixup:
            ratios_t = ratios[mask_b].copy()
        if len(boxes_t) == 0:
            # Augmentation filtered everything out: fall back to the
            # unaugmented image and its boxes instead of an empty target.
            image_t, r_o = preproc(image_o, input_dim, self.means, self.std)
            boxes_o *= r_o
            boxes_t = boxes_o
            labels_t = labels_o
            ratios_t = ratios_o
        labels_t = np.expand_dims(labels_t, 1)
        if mixup:
            ratios_t = np.expand_dims(ratios_t, 1)
            targets_t = np.hstack((labels_t, boxes_t, ratios_t))
        else:
            targets_t = np.hstack((labels_t, boxes_t))
        # Pad/truncate to a fixed row count so batches stack cleanly.
        padded_labels = np.zeros((self.max_labels, lshape))
        padded_labels[range(len(targets_t))[: self.max_labels]] = targets_t[
            : self.max_labels
        ]
        padded_labels = np.ascontiguousarray(padded_labels, dtype=np.float32)
        image_t = np.ascontiguousarray(image_t, dtype=np.float32)
        return image_t, padded_labels
class ValTransform:
    """Validation-time preprocessing: letterbox-resize and normalize only.

    No augmentation is applied. The callable returns the processed image as
    a torch tensor plus a dummy (1, 5) zero label tensor, since ground truth
    is not consumed through this path.

    Args:
        rgb_means: optional per-channel means subtracted after /255 scaling.
        std: optional per-channel std the image is divided by.
        swap: output axis order passed to ``preproc`` (default CHW).
    """
    def __init__(self, rgb_means=None, std=None, swap=(2, 0, 1)):
        self.means = rgb_means
        self.std = std
        self.swap = swap

    # assume input is a cv2 (BGR, HWC) image for now
    def __call__(self, img, res, input_size):
        processed, _ = preproc(img, input_size, self.means, self.std, self.swap)
        # Dummy labels: evaluation code ignores targets from this transform.
        return torch.from_numpy(processed), torch.zeros(1, 5)
================================================
FILE: detector/YOLOX/yolox/data/data_prefetcher.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
import random
import torch
import torch.distributed as dist
from ..utils import synchronize
class DataPrefetcher:
    """
    DataPrefetcher is inspired by code of following file:
    https://github.com/NVIDIA/apex/blob/master/examples/imagenet/main_amp.py
    It could speedup your pytorch dataloader. For more information, please check
    https://github.com/NVIDIA/apex/issues/304#issuecomment-493562789.

    Overlaps host->device copies of the next batch (on a side CUDA stream)
    with compute on the current batch. Requires a CUDA device.
    """
    def __init__(self, loader):
        self.loader = iter(loader)
        # Dedicated stream so the async copies don't serialize with compute
        # on the default stream.
        self.stream = torch.cuda.Stream()
        self.input_cuda = self._input_cuda_for_image
        self.record_stream = DataPrefetcher._record_stream_for_image
        self.preload()
    def preload(self):
        # Fetch the next batch and kick off its asynchronous GPU copy.
        try:
            self.next_input, self.next_target, _, _ = next(self.loader)
        except StopIteration:
            # Loader exhausted: None sentinels signal the end to next().
            self.next_input = None
            self.next_target = None
            return
        with torch.cuda.stream(self.stream):
            # Copies are issued on the side stream; non_blocking lets them
            # overlap with work already queued on the current stream.
            self.input_cuda()
            self.next_target = self.next_target.cuda(non_blocking=True)
    def next(self):
        # Block the current stream until this batch's copies are complete.
        torch.cuda.current_stream().wait_stream(self.stream)
        input = self.next_input
        target = self.next_target
        if input is not None:
            # record_stream keeps the tensor's memory alive until the
            # current stream has finished using it (cross-stream safety).
            self.record_stream(input)
        if target is not None:
            target.record_stream(torch.cuda.current_stream())
        self.preload()
        return input, target
    def _input_cuda_for_image(self):
        # Async device copy of the image batch (assumes a single tensor).
        self.next_input = self.next_input.cuda(non_blocking=True)
    @staticmethod
    def _record_stream_for_image(input):
        input.record_stream(torch.cuda.current_stream())
def random_resize(data_loader, exp, epoch, rank, is_distributed):
    """Sample a new random input size and apply it to the data loader.

    Rank 0 draws the size (a multiple of 32 within ``exp.random_size``);
    under distributed training the choice is broadcast so every rank resizes
    identically. In the last 10 epochs the size is frozen to
    ``exp.input_size``.

    NOTE(review): ``exp.input_size`` looks like it may be a tuple while
    ``tensor.fill_`` expects a scalar — confirm against the Exp class.

    Returns:
        The (width, height) returned by ``data_loader.change_input_dim``.
    """
    tensor = torch.LongTensor(1).cuda()
    if is_distributed:
        # Ensure all ranks arrive before rank 0 samples the new size.
        synchronize()
    if rank == 0:
        if epoch > exp.max_epoch - 10:
            # Freeze the input size near the end of training.
            size = exp.input_size
        else:
            size = random.randint(*exp.random_size)
            size = int(32 * size)
        tensor.fill_(size)
    if is_distributed:
        synchronize()
        # Share rank 0's choice with every rank.
        dist.broadcast(tensor, 0)
    input_size = data_loader.change_input_dim(multiple=tensor.item(), random_range=None)
    return input_size
================================================
FILE: detector/YOLOX/yolox/data/dataloading.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
import os
import random
import torch
from torch.utils.data.dataloader import DataLoader as torchDataLoader
from torch.utils.data.dataloader import default_collate
from .samplers import YoloBatchSampler
def get_yolox_datadir():
    """Resolve the YOLOX dataset root directory.

    Honors the ``YOLOX_DATADIR`` environment variable when it is set;
    otherwise falls back to the ``datasets`` folder that sits next to the
    installed ``yolox`` package.
    """
    datadir = os.getenv("YOLOX_DATADIR", None)
    if datadir is not None:
        return datadir
    # Lazy import: only needed for the fallback path.
    import yolox
    package_root = os.path.dirname(os.path.dirname(yolox.__file__))
    return os.path.join(package_root, "datasets")
class DataLoader(torchDataLoader):
    """
    Lightnet dataloader that enables on the fly resizing of the images.
    See :class:`torch.utils.data.DataLoader` for more information on the arguments.
    Check more on the following website:
    https://gitlab.com/EAVISE/lightnet/-/blob/master/lightnet/data/_dataloading.py
    Note:
        This dataloader only works with :class:`lightnet.data.Dataset` based datasets.
    Example:
        >>> class CustomSet(ln.data.Dataset):
        ...     def __len__(self):
        ...         return 4
        ...     @ln.data.Dataset.resize_getitem
        ...     def __getitem__(self, index):
        ...         # Should return (image, anno) but here we return (input_dim,)
        ...         return (self.input_dim,)
        >>> dl = ln.data.DataLoader(
        ...     CustomSet((200,200)),
        ...     batch_size = 2,
        ...     collate_fn = ln.data.list_collate  # We want the data to be grouped as a list
        ... )
        >>> dl.dataset.input_dim  # Default input_dim
        (200, 200)
        >>> for d in dl:
        ...     d
        [[(200, 200), (200, 200)]]
        [[(200, 200), (200, 200)]]
        >>> dl.change_input_dim(320, random_range=None)
        (320, 320)
        >>> for d in dl:
        ...     d
        [[(320, 320), (320, 320)]]
        [[(320, 320), (320, 320)]]
        >>> dl.change_input_dim((480, 320), random_range=None)
        (480, 320)
        >>> for d in dl:
        ...     d
        [[(480, 320), (480, 320)]]
        [[(480, 320), (480, 320)]]
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.__initialized = False
        # Recover shuffle/sampler/batch_sampler from positional or keyword
        # arguments, mirroring torch.utils.data.DataLoader's signature
        # (dataset, batch_size, shuffle, sampler, batch_sampler, ...).
        shuffle = False
        # Bug fix: `sampler` was previously left unbound when it came from
        # neither positional args nor kwargs, raising UnboundLocalError at
        # the `if sampler is None` check below.
        sampler = None
        batch_sampler = None
        if len(args) > 5:
            shuffle = args[2]
            sampler = args[3]
            batch_sampler = args[4]
        elif len(args) > 4:
            shuffle = args[2]
            sampler = args[3]
            if "batch_sampler" in kwargs:
                batch_sampler = kwargs["batch_sampler"]
        elif len(args) > 3:
            shuffle = args[2]
            if "sampler" in kwargs:
                sampler = kwargs["sampler"]
            if "batch_sampler" in kwargs:
                batch_sampler = kwargs["batch_sampler"]
        else:
            if "shuffle" in kwargs:
                shuffle = kwargs["shuffle"]
            if "sampler" in kwargs:
                sampler = kwargs["sampler"]
            if "batch_sampler" in kwargs:
                batch_sampler = kwargs["batch_sampler"]
        # Use custom BatchSampler so the input dimension can change per batch.
        if batch_sampler is None:
            if sampler is None:
                if shuffle:
                    sampler = torch.utils.data.sampler.RandomSampler(self.dataset)
                    # sampler = torch.utils.data.DistributedSampler(self.dataset)
                else:
                    sampler = torch.utils.data.sampler.SequentialSampler(self.dataset)
            batch_sampler = YoloBatchSampler(
                sampler,
                self.batch_size,
                self.drop_last,
                input_dimension=self.dataset.input_dim,
            )
            # batch_sampler = IterationBasedBatchSampler(batch_sampler, num_iterations =
        self.batch_sampler = batch_sampler
        self.__initialized = True
    def close_mosaic(self):
        """Disable mosaic augmentation on the underlying batch sampler."""
        self.batch_sampler.mosaic = False
    def change_input_dim(self, multiple=32, random_range=(10, 19)):
        """ This function will compute a new size and update it on the next mini_batch.
        Args:
            multiple (int or tuple, optional): values to multiply the randomly generated range by.
                Default **32**
            random_range (tuple, optional): This (min, max) tuple sets the range
                for the randomisation; Default **(10, 19)**
        Return:
            tuple: width, height tuple with new dimension
        Note:
            The new size is generated as follows: |br|
            First we compute a random integer inside ``[random_range]``.
            We then multiply that number with the ``multiple`` argument,
            which gives our final new input size. |br|
            If ``multiple`` is an integer we generate a square size. If you give a tuple
            of **(width, height)**, the size is computed
            as :math:`rng * multiple[0], rng * multiple[1]`.
        Note:
            You can set the ``random_range`` argument to **None** to set
            an exact size of multiply. |br|
            See the example above for how this works.
        """
        if random_range is None:
            size = 1
        else:
            size = random.randint(*random_range)
        if isinstance(multiple, int):
            size = (size * multiple, size * multiple)
        else:
            size = (size * multiple[0], size * multiple[1])
        self.batch_sampler.new_input_dim = size
        return size
def list_collate(batch):
    """Collate lists or tuples together into one list (of lists/tuples).

    Use this as the collate function in a Dataloader if you want list-typed
    sample fields to stay lists (e.g. variable-length annotations) instead
    of being stacked into tensors; every other field goes through
    ``default_collate``.
    """
    columns = list(zip(*batch))
    collated = []
    for column in columns:
        if isinstance(column[0], (list, tuple)):
            # Keep ragged data as a plain list of per-sample values.
            collated.append(list(column))
        else:
            collated.append(default_collate(column))
    return collated
================================================
FILE: detector/YOLOX/yolox/data/datasets/__init__.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
from .coco import COCODataset
from .coco_classes import COCO_CLASSES
from .datasets_wrapper import ConcatDataset, Dataset, MixConcatDataset
#from .mosaicdetection import MosaicDetection
#from .voc import VOCDetection
================================================
FILE: detector/YOLOX/yolox/data/datasets/coco.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
import os
import cv2
import numpy as np
from pycocotools.coco import COCO
from ..dataloading import get_yolox_datadir
from .datasets_wrapper import Dataset
class COCODataset(Dataset):
    """
    COCO dataset class.

    Loads images from ``<data_dir>/<name>/`` and annotations via the COCO
    API; boxes are returned in pixel xyxy plus a contiguous class index
    (position of the category id in the sorted id list).
    """
    def __init__(
        self,
        data_dir=None,
        json_file="instances_train2017.json",
        name="train2017",
        img_size=(416, 416),
        preproc=None,
    ):
        """
        COCO dataset initialization. Annotation data are read into memory by COCO API.
        Args:
            data_dir (str): dataset root directory
            json_file (str): COCO json file name
            name (str): COCO data name (e.g. 'train2017' or 'val2017')
            img_size (tuple): target image size after pre-processing
            preproc: data augmentation strategy (callable) or None
        """
        super().__init__(img_size)
        if data_dir is None:
            data_dir = os.path.join(get_yolox_datadir(), "COCO")
        self.data_dir = data_dir
        self.json_file = json_file
        self.coco = COCO(os.path.join(self.data_dir, "annotations", self.json_file))
        self.ids = self.coco.getImgIds()
        # Sorted category ids define the contiguous class index used in labels.
        self.class_ids = sorted(self.coco.getCatIds())
        cats = self.coco.loadCats(self.coco.getCatIds())
        self._classes = tuple([c["name"] for c in cats])
        self.name = name
        self.img_size = img_size
        self.preproc = preproc
    def __len__(self):
        return len(self.ids)
    def load_anno(self, index):
        """Return an (n, 5) array of [x1, y1, x2, y2, class] for one image."""
        id_ = self.ids[index]
        anno_ids = self.coco.getAnnIds(imgIds=[int(id_)], iscrowd=False)
        annotations = self.coco.loadAnns(anno_ids)
        im_ann = self.coco.loadImgs(id_)[0]
        width = im_ann["width"]
        height = im_ann["height"]
        # Clip COCO's (x, y, w, h) boxes into the image and convert to xyxy;
        # drop degenerate or zero-area objects.
        valid_objs = []
        for obj in annotations:
            x1 = np.max((0, obj["bbox"][0]))
            y1 = np.max((0, obj["bbox"][1]))
            x2 = np.min((width - 1, x1 + np.max((0, obj["bbox"][2] - 1))))
            y2 = np.min((height - 1, y1 + np.max((0, obj["bbox"][3] - 1))))
            if obj["area"] > 0 and x2 >= x1 and y2 >= y1:
                obj["clean_bbox"] = [x1, y1, x2, y2]
                valid_objs.append(obj)
        objs = valid_objs
        num_objs = len(objs)
        res = np.zeros((num_objs, 5))
        for ix, obj in enumerate(objs):
            cls = self.class_ids.index(obj["category_id"])
            res[ix, 0:4] = obj["clean_bbox"]
            res[ix, 4] = cls
        return res
    def pull_item(self, index):
        """Return the raw (image, annotations, (h, w), image_id) tuple."""
        id_ = self.ids[index]
        im_ann = self.coco.loadImgs(id_)[0]
        width = im_ann["width"]
        height = im_ann["height"]
        # COCO image files are named by zero-padded 12-digit image id.
        img_file = os.path.join(
            self.data_dir, self.name, "{:012}".format(id_) + ".jpg"
        )
        img = cv2.imread(img_file)
        assert img is not None
        res = self.load_anno(index)
        img_info = (height, width)
        return img, res, img_info, id_
    @Dataset.resize_getitem
    def __getitem__(self, index):
        """
        One image / label pair for the given index is picked up and pre-processed.
        Args:
            index (int): data index
        Returns:
            img (numpy.ndarray): image, pre-processed by ``self.preproc``
                when one is set, otherwise the raw BGR image.
            target: labels produced by ``self.preproc`` (padded array) or,
                without a preproc, the raw (n, 5) annotation array.
            img_info (tuple): (height, width) of the original image.
            img_id (int): COCO image id. Used for evaluation.
        """
        img, res, img_info, img_id = self.pull_item(index)
        if self.preproc is not None:
            img, target = self.preproc(img, res, self.input_dim)
        else:
            # Bug fix: `target` used to be unbound (NameError) on this path;
            # fall back to the raw annotations when no preproc is configured.
            target = res
        return img, target, img_info, img_id
================================================
FILE: detector/YOLOX/yolox/data/datasets/coco_classes.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
# The 80 COCO object-category names in standard COCO ordering; presumably
# class index = position in this tuple, matching the sorted category ids
# used by COCODataset — verify against the annotation file.
COCO_CLASSES = (
    "person",
    "bicycle",
    "car",
    "motorcycle",
    "airplane",
    "bus",
    "train",
    "truck",
    "boat",
    "traffic light",
    "fire hydrant",
    "stop sign",
    "parking meter",
    "bench",
    "bird",
    "cat",
    "dog",
    "horse",
    "sheep",
    "cow",
    "elephant",
    "bear",
    "zebra",
    "giraffe",
    "backpack",
    "umbrella",
    "handbag",
    "tie",
    "suitcase",
    "frisbee",
    "skis",
    "snowboard",
    "sports ball",
    "kite",
    "baseball bat",
    "baseball glove",
    "skateboard",
    "surfboard",
    "tennis racket",
    "bottle",
    "wine glass",
    "cup",
    "fork",
    "knife",
    "spoon",
    "bowl",
    "banana",
    "apple",
    "sandwich",
    "orange",
    "broccoli",
    "carrot",
    "hot dog",
    "pizza",
    "donut",
    "cake",
    "chair",
    "couch",
    "potted plant",
    "bed",
    "dining table",
    "toilet",
    "tv",
    "laptop",
    "mouse",
    "remote",
    "keyboard",
    "cell phone",
    "microwave",
    "oven",
    "toaster",
    "sink",
    "refrigerator",
    "book",
    "clock",
    "vase",
    "scissors",
    "teddy bear",
    "hair drier",
    "toothbrush",
)
================================================
FILE: detector/YOLOX/yolox/data/datasets/datasets_wrapper.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
import bisect
from functools import wraps
from torch.utils.data.dataset import ConcatDataset as torchConcatDataset
from torch.utils.data.dataset import Dataset as torchDataset
class ConcatDataset(torchConcatDataset):
    """torch ConcatDataset that forwards ``pull_item`` to its children.

    Also mirrors the first child's ``input_dim`` (when present) so that
    transforms see a single source of truth for the network input size.
    """

    def __init__(self, datasets):
        super(ConcatDataset, self).__init__(datasets)
        if hasattr(self.datasets[0], "input_dim"):
            self._input_dim = self.datasets[0].input_dim
            self.input_dim = self.datasets[0].input_dim

    def pull_item(self, idx):
        """Fetch the raw (un-transformed) item at global index ``idx``."""
        if idx < 0:
            if -idx > len(self):
                raise ValueError(
                    "absolute value of index should not exceed dataset length"
                )
            idx += len(self)
        # Locate the child dataset holding this global index.
        dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
        offset = 0 if dataset_idx == 0 else self.cumulative_sizes[dataset_idx - 1]
        return self.datasets[dataset_idx].pull_item(idx - offset)
class MixConcatDataset(torchConcatDataset):
    """Concatenation of datasets indexable by (input_dim, idx, mosaic) tuples.

    The mosaic/resize machinery indexes datasets with a 3-tuple whose middle
    element is the sample position; this class rewrites that position into
    the owning child dataset's local index before forwarding.
    """

    def __init__(self, datasets):
        super(MixConcatDataset, self).__init__(datasets)
        if hasattr(self.datasets[0], "input_dim"):
            self._input_dim = self.datasets[0].input_dim
            self.input_dim = self.datasets[0].input_dim

    def __getitem__(self, index):
        # Bug fix: a plain int index previously left ``idx`` unbound
        # (UnboundLocalError); it is now handled as a direct position.
        if isinstance(index, int):
            idx = index
        else:
            idx = index[1]
        if idx < 0:
            if -idx > len(self):
                raise ValueError(
                    "absolute value of index should not exceed dataset length"
                )
            idx = len(self) + idx
        # Locate the child dataset and the local index within it.
        dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
        if dataset_idx == 0:
            sample_idx = idx
        else:
            sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
        if not isinstance(index, int):
            # Forward the tuple with the position rewritten to the local index.
            index = (index[0], sample_idx, index[2])
            return self.datasets[dataset_idx][index]
        return self.datasets[dataset_idx][sample_idx]
class Dataset(torchDataset):
    """ torch Dataset subclass with an on-the-fly resizable ``input_dim``.

    Args:
        input_dimension (tuple): default (width, height) of the network input;
            only the first two elements are kept.
        mosaic (bool): whether mosaic augmentation starts enabled.
    """
    def __init__(self, input_dimension, mosaic=True):
        super().__init__()
        self.__input_dim = input_dimension[:2]
        self._mosaic = mosaic

    @property
    def input_dim(self):
        """Current (width, height) used by transforms.

        Returns the temporary ``_input_dim`` installed by
        ``resize_getitem`` when one is active, otherwise the default
        dimension given at construction. This gives transforms a single
        source of truth for the network input size.
        """
        return getattr(self, "_input_dim", self.__input_dim)

    @staticmethod
    def resize_getitem(getitem_fn):
        """Decorator enabling tuple-index resizing for ``__getitem__``.

        When the dataset is indexed with ``(input_dim, index, mosaic)``,
        the wrapper temporarily installs ``input_dim`` and ``mosaic`` on
        the instance, calls the wrapped getter with the plain index, then
        removes the temporary dimension again. Plain int indices pass
        straight through.
        """
        @wraps(getitem_fn)
        def wrapper(self, index):
            resized = not isinstance(index, int)
            if resized:
                self._input_dim = index[0]
                self._mosaic = index[2]
                index = index[1]
            result = getitem_fn(self, index)
            if resized:
                # Restore the default dimension for subsequent accesses.
                del self._input_dim
            return result

        return wrapper
================================================
FILE: detector/YOLOX/yolox/data/datasets/mosaicdetection.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
import random
import cv2
import numpy as np
from yolox.utils import adjust_box_anns
from ..data_augment import box_candidates, random_perspective
from .datasets_wrapper import Dataset
def get_mosaic_coordinate(mosaic_image, mosaic_index, xc, yc, w, h, input_h, input_w):
    """Compute paste coordinates for one quadrant of the 2x mosaic canvas.

    Args:
        mosaic_image: unused; kept for signature compatibility.
        mosaic_index (int): quadrant id — 0 top-left, 1 top-right,
            2 bottom-left, 3 bottom-right (relative to the center).
        xc, yc (int): mosaic center on the (2*input_h, 2*input_w) canvas.
        w, h (int): size of the resized source image.
        input_h, input_w (int): network input size.

    Returns:
        ((x1, y1, x2, y2), (sx1, sy1, sx2, sy2)): destination rectangle on
        the large canvas and the matching crop from the small source image
        (both clipped at the canvas border).
    """
    if mosaic_index == 0:  # top-left: image's bottom-right corner at (xc, yc)
        x1, y1, x2, y2 = max(xc - w, 0), max(yc - h, 0), xc, yc
        crop = (w - (x2 - x1), h - (y2 - y1), w, h)
    elif mosaic_index == 1:  # top-right
        x1, y1, x2, y2 = xc, max(yc - h, 0), min(xc + w, input_w * 2), yc
        crop = (0, h - (y2 - y1), min(w, x2 - x1), h)
    elif mosaic_index == 2:  # bottom-left
        x1, y1, x2, y2 = max(xc - w, 0), yc, xc, min(input_h * 2, yc + h)
        crop = (w - (x2 - x1), 0, w, min(y2 - y1, h))
    elif mosaic_index == 3:  # bottom-right
        x1, y1, x2, y2 = xc, yc, min(xc + w, input_w * 2), min(input_h * 2, yc + h)
        crop = (0, 0, min(w, x2 - x1), min(y2 - y1, h))
    return (x1, y1, x2, y2), crop
class MosaicDetection(Dataset):
    """Detection dataset wrapper that performs mosaic and mixup augmentation
    for a normal dataset."""
    def __init__(
        self, dataset, img_size, mosaic=True, preproc=None,
        degrees=10.0, translate=0.1, scale=(0.5, 1.5), mscale=(0.5, 1.5),
        shear=2.0, perspective=0.0, enable_mixup=True, *args
    ):
        """
        Args:
            dataset(Dataset) : Pytorch dataset object.
            img_size (tuple): network input size (height, width).
            mosaic (bool): enable mosaic augmentation or not.
            preproc (func): per-sample transform applied after mosaic/mixup.
            degrees (float): rotation range for random_perspective.
            translate (float): translation fraction for random_perspective.
            scale (tuple): (min, max) scale range for random_perspective.
            mscale (tuple): (min, max) jitter scale range for mixup.
            shear (float): shear range (degrees) for random_perspective.
            perspective (float): perspective distortion strength.
            enable_mixup (bool): whether mixup runs after mosaic.
            *args(tuple) : Additional arguments for mixup random sampler.
        """
        super().__init__(img_size, mosaic=mosaic)
        self._dataset = dataset
        self.preproc = preproc
        self.degrees = degrees
        self.translate = translate
        self.scale = scale
        self.shear = shear
        self.perspective = perspective
        self.mixup_scale = mscale
        self.enable_mosaic = mosaic
        self.enable_mixup = enable_mixup
    def __len__(self):
        return len(self._dataset)
    @Dataset.resize_getitem
    def __getitem__(self, idx):
        """Return (image, labels, img_info, idx); with mosaic enabled, the
        image is a 4-tile composite warped by random_perspective and
        optionally blended via mixup. Assumes ``self.preproc`` is set —
        both branches call it unconditionally."""
        if self.enable_mosaic:
            mosaic_labels = []
            input_dim = self._dataset.input_dim
            input_h, input_w = input_dim[0], input_dim[1]
            # yc, xc = s, s  # mosaic center x, y
            # Random mosaic center inside the middle of the 2x canvas.
            yc = int(random.uniform(0.5 * input_h, 1.5 * input_h))
            xc = int(random.uniform(0.5 * input_w, 1.5 * input_w))
            # 3 additional image indices chosen uniformly at random
            indices = [idx] + [random.randint(0, len(self._dataset) - 1) for _ in range(3)]
            for i_mosaic, index in enumerate(indices):
                img, _labels, _, _ = self._dataset.pull_item(index)
                h0, w0 = img.shape[:2]  # orig hw
                scale = min(1. * input_h / h0, 1. * input_w / w0)
                img = cv2.resize(
                    img, (int(w0 * scale), int(h0 * scale)), interpolation=cv2.INTER_LINEAR
                )
                # generate output mosaic image (2x input size, gray-filled)
                (h, w, c) = img.shape[:3]
                if i_mosaic == 0:
                    mosaic_img = np.full((input_h * 2, input_w * 2, c), 114, dtype=np.uint8)
                # suffix l means large image, while s means small image in mosaic aug.
                (l_x1, l_y1, l_x2, l_y2), (s_x1, s_y1, s_x2, s_y2) = get_mosaic_coordinate(
                    mosaic_img, i_mosaic, xc, yc, w, h, input_h, input_w
                )
                mosaic_img[l_y1:l_y2, l_x1:l_x2] = img[s_y1:s_y2, s_x1:s_x2]
                padw, padh = l_x1 - s_x1, l_y1 - s_y1
                labels = _labels.copy()
                # Normalized xywh to pixel xyxy format
                # (shift each tile's boxes into mosaic-canvas coordinates)
                if _labels.size > 0:
                    labels[:, 0] = scale * _labels[:, 0] + padw
                    labels[:, 1] = scale * _labels[:, 1] + padh
                    labels[:, 2] = scale * _labels[:, 2] + padw
                    labels[:, 3] = scale * _labels[:, 3] + padh
                mosaic_labels.append(labels)
            if len(mosaic_labels):
                mosaic_labels = np.concatenate(mosaic_labels, 0)
                # Clip all boxes to the 2x canvas before warping.
                np.clip(mosaic_labels[:, 0], 0, 2 * input_w, out=mosaic_labels[:, 0])
                np.clip(mosaic_labels[:, 1], 0, 2 * input_h, out=mosaic_labels[:, 1])
                np.clip(mosaic_labels[:, 2], 0, 2 * input_w, out=mosaic_labels[:, 2])
                np.clip(mosaic_labels[:, 3], 0, 2 * input_h, out=mosaic_labels[:, 3])
            # Warp the 2x canvas back down to input size; the negative border
            # crops the composite to (input_h, input_w).
            mosaic_img, mosaic_labels = random_perspective(
                mosaic_img,
                mosaic_labels,
                degrees=self.degrees,
                translate=self.translate,
                scale=self.scale,
                shear=self.shear,
                perspective=self.perspective,
                border=[-input_h // 2, -input_w // 2],
            )  # border to remove
            # -----------------------------------------------------------------
            # CopyPaste: https://arxiv.org/abs/2012.07177
            # -----------------------------------------------------------------
            if self.enable_mixup and not len(mosaic_labels) == 0:
                mosaic_img, mosaic_labels = self.mixup(mosaic_img, mosaic_labels, self.input_dim)
            mix_img, padded_labels = self.preproc(mosaic_img, mosaic_labels, self.input_dim)
            img_info = (mix_img.shape[1], mix_img.shape[0])
            return mix_img, padded_labels, img_info, int(idx)
        else:
            # Plain path: propagate the current input_dim to the wrapped
            # dataset and run only the per-sample preproc.
            self._dataset._input_dim = self.input_dim
            img, label, img_info, idx = self._dataset.pull_item(idx)
            img, label = self.preproc(img, label, self.input_dim)
            return img, label, img_info, int(idx)
    def mixup(self, origin_img, origin_labels, input_dim):
        """Blend a randomly drawn, re-scaled (and maybe flipped) sample into
        ``origin_img`` at 50/50 opacity, appending its surviving boxes."""
        jit_factor = random.uniform(*self.mixup_scale)
        FLIP = random.uniform(0, 1) > 0.5
        cp_labels = []
        # Resample until we draw an image that actually has annotations.
        while len(cp_labels) == 0:
            cp_index = random.randint(0, self.__len__() - 1)
            cp_labels = self._dataset.load_anno(cp_index)
        img, cp_labels, _, _ = self._dataset.pull_item(cp_index)
        # Letterbox the copied image onto a gray canvas at input_dim.
        if len(img.shape) == 3:
            cp_img = np.ones((input_dim[0], input_dim[1], 3)) * 114.0
        else:
            cp_img = np.ones(input_dim) * 114.0
        cp_scale_ratio = min(input_dim[0] / img.shape[0], input_dim[1] / img.shape[1])
        resized_img = cv2.resize(
            img,
            (int(img.shape[1] * cp_scale_ratio), int(img.shape[0] * cp_scale_ratio)),
            interpolation=cv2.INTER_LINEAR,
        ).astype(np.float32)
        cp_img[
            : int(img.shape[0] * cp_scale_ratio), : int(img.shape[1] * cp_scale_ratio)
        ] = resized_img
        # Random scale jitter of the whole letterboxed canvas.
        cp_img = cv2.resize(
            cp_img,
            (int(cp_img.shape[1] * jit_factor), int(cp_img.shape[0] * jit_factor)),
        )
        cp_scale_ratio *= jit_factor
        if FLIP:
            cp_img = cp_img[:, ::-1, :]
        origin_h, origin_w = cp_img.shape[:2]
        target_h, target_w = origin_img.shape[:2]
        # Pad so the jittered image can be cropped at a random offset to the
        # target size.
        padded_img = np.zeros(
            (max(origin_h, target_h), max(origin_w, target_w), 3)
        ).astype(np.uint8)
        padded_img[:origin_h, :origin_w] = cp_img
        x_offset, y_offset = 0, 0
        if padded_img.shape[0] > target_h:
            y_offset = random.randint(0, padded_img.shape[0] - target_h - 1)
        if padded_img.shape[1] > target_w:
            x_offset = random.randint(0, padded_img.shape[1] - target_w - 1)
        padded_cropped_img = padded_img[
            y_offset: y_offset + target_h, x_offset: x_offset + target_w
        ]
        # Scale the copied boxes into the jittered canvas coordinates.
        cp_bboxes_origin_np = adjust_box_anns(
            cp_labels[:, :4], cp_scale_ratio, 0, 0, origin_w, origin_h
        )
        if FLIP:
            cp_bboxes_origin_np[:, 0::2] = (
                origin_w - cp_bboxes_origin_np[:, 0::2][:, ::-1]
            )
        # Shift by the crop offset and clip into the target image.
        cp_bboxes_transformed_np = cp_bboxes_origin_np.copy()
        cp_bboxes_transformed_np[:, 0::2] = np.clip(
            cp_bboxes_transformed_np[:, 0::2] - x_offset, 0, target_w
        )
        cp_bboxes_transformed_np[:, 1::2] = np.clip(
            cp_bboxes_transformed_np[:, 1::2] - y_offset, 0, target_h
        )
        # Keep only boxes that survived the crop (min side 5 px, etc.).
        keep_list = box_candidates(cp_bboxes_origin_np.T, cp_bboxes_transformed_np.T, 5)
        if keep_list.sum() >= 1.0:
            cls_labels = cp_labels[keep_list, 4:5]
            box_labels = cp_bboxes_transformed_np[keep_list]
            labels = np.hstack((box_labels, cls_labels))
            origin_labels = np.vstack((origin_labels, labels))
            origin_img = origin_img.astype(np.float32)
            # 50/50 pixel blend of the two images.
            origin_img = 0.5 * origin_img + 0.5 * padded_cropped_img.astype(np.float32)
        return origin_img.astype(np.uint8), origin_labels
================================================
FILE: detector/YOLOX/yolox/data/datasets/voc.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Code are based on
# https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py
# Copyright (c) Francisco Massa.
# Copyright (c) Ellis Brown, Max deGroot.
# Copyright (c) Megvii, Inc. and its affiliates.
import os
import os.path
import pickle
import xml.etree.ElementTree as ET
import cv2
import numpy as np
from yolox.evaluators.voc_eval import voc_eval
from .datasets_wrapper import Dataset
from .voc_classes import VOC_CLASSES
class AnnotationTransform(object):
    """Transforms a VOC annotation into an array of bbox coords and label index.

    Initialized with a dictionary lookup of classnames to indexes.

    Arguments:
        class_to_ind (dict, optional): dictionary lookup of classnames -> indexes
            (default: alphabetic indexing of VOC's 20 classes)
        keep_difficult (bool, optional): keep objects marked ``difficult`` or not
            (default: True)
    """
    def __init__(self, class_to_ind=None, keep_difficult=True):
        self.class_to_ind = class_to_ind or dict(zip(VOC_CLASSES, range(len(VOC_CLASSES))))
        self.keep_difficult = keep_difficult
    def __call__(self, target):
        """
        Arguments:
            target (annotation) : the target annotation to be made usable
                will be an ET.Element
        Returns:
            (n, 5) float array of [xmin, ymin, xmax, ymax, label_ind] rows
        """
        # Collect rows first and build the array once at the end — the
        # previous per-object np.vstack was O(n^2) in the object count.
        rows = []
        for obj in target.iter("object"):
            difficult = int(obj.find("difficult").text) == 1
            if not self.keep_difficult and difficult:
                continue
            name = obj.find("name").text.lower().strip()
            bbox = obj.find("bndbox")
            # VOC pixel coordinates are 1-based; shift to 0-based indices.
            bndbox = [int(bbox.find(pt).text) - 1 for pt in ("xmin", "ymin", "xmax", "ymax")]
            bndbox.append(self.class_to_ind[name])
            rows.append(bndbox)
        if not rows:
            return np.empty((0, 5))
        # float64 matches the dtype the old vstack-onto-empty code produced.
        return np.array(rows, dtype=np.float64)  # [[xmin, ymin, xmax, ymax, label_ind], ... ]
class VOCDetection(Dataset):

    """
    VOC Detection Dataset Object

    input is image, target is annotation

    Args:
        data_dir (string): filepath to VOCdevkit folder.
        image_sets (list): (year, imageset) pairs to load,
            e.g. [('2007', 'trainval'), ('2012', 'trainval')]
        img_size (tuple): target image size handed to the base Dataset.
        preproc (callable, optional): transformation to perform on the
            input image
        target_transform (callable, optional): transformation to perform on the
            target `annotation`
            (eg: take in caption string, return tensor of word indices)
        dataset_name (string, optional): which dataset to load
            (default: 'VOC0712')
    """

    def __init__(
        self,
        data_dir,
        image_sets=[('2007', 'trainval'), ('2012', 'trainval')],
        img_size=(416, 416),
        preproc=None,
        target_transform=AnnotationTransform(),
        dataset_name="VOC0712",
    ):
        super().__init__(img_size)
        self.root = data_dir
        self.image_set = image_sets
        self.img_size = img_size
        self.preproc = preproc
        self.target_transform = target_transform
        self.name = dataset_name
        # "%s" placeholders are filled with (year rootpath, image id).
        self._annopath = os.path.join("%s", "Annotations", "%s.xml")
        self._imgpath = os.path.join("%s", "JPEGImages", "%s.jpg")
        self._classes = VOC_CLASSES
        self.ids = list()
        for (year, name) in image_sets:
            # NOTE: after the loop self._year is the LAST year listed; it is
            # later used to locate the results/cache directories.
            self._year = year
            rootpath = os.path.join(self.root, "VOC" + year)
            for line in open(
                os.path.join(rootpath, "ImageSets", "Main", name + ".txt")
            ):
                self.ids.append((rootpath, line.strip()))

    def __len__(self):
        return len(self.ids)

    def load_anno(self, index):
        """Parse and (optionally) transform the annotation for image `index`."""
        img_id = self.ids[index]
        target = ET.parse(self._annopath % img_id).getroot()
        if self.target_transform is not None:
            target = self.target_transform(target)
        return target

    def pull_item(self, index):
        """Returns the original image and target at an index for mixup

        Note: not using self.__getitem__(), as any transformations passed in
        could mess up this functionality.

        Argument:
            index (int): index of img to show
        Return:
            img, target, img_info, index
        """
        img_id = self.ids[index]
        img = cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR)
        height, width, _ = img.shape

        target = self.load_anno(index)

        img_info = (width, height)

        return img, target, img_info, index

    @Dataset.resize_getitem
    def __getitem__(self, index):
        img, target, img_info, img_id = self.pull_item(index)

        if self.preproc is not None:
            img, target = self.preproc(img, target, self.input_dim)

        return img, target, img_info, img_id

    def evaluate_detections(self, all_boxes, output_dir=None):
        """
        all_boxes is a list of length number-of-classes.
        Each list element is a list of length number-of-images.
        Each of those list elements is either an empty list []
        or a numpy array of detection.

        all_boxes[class][image] = [] or np.array of shape #dets x 5

        Returns:
            (mAP averaged over IoU 0.5:0.95, mAP at IoU 0.5)
        """
        self._write_voc_results_file(all_boxes)
        # evaluate at the 10 COCO-style IoU thresholds 0.50, 0.55, ..., 0.95
        IouTh = np.linspace(0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True)
        mAPs = []
        for iou in IouTh:
            mAP = self._do_python_eval(output_dir, iou)
            mAPs.append(mAP)

        print("--------------------------------------------------------------")
        print("map_5095:", np.mean(mAPs))
        print("map_50:", mAPs[0])
        print("--------------------------------------------------------------")
        return np.mean(mAPs), mAPs[0]

    def _get_voc_results_file_template(self):
        """Return a path template for per-class result files (one {:s} slot)."""
        filename = "comp4_det_test" + "_{:s}.txt"
        filedir = os.path.join(self.root, "results", "VOC" + self._year, "Main")
        if not os.path.exists(filedir):
            os.makedirs(filedir)
        path = os.path.join(filedir, filename)
        return path

    def _write_voc_results_file(self, all_boxes):
        """Dump detections into one text file per class, in VOC devkit format."""
        for cls_ind, cls in enumerate(VOC_CLASSES):
            cls_ind = cls_ind
            if cls == "__background__":
                continue
            print("Writing {} VOC results file".format(cls))
            filename = self._get_voc_results_file_template().format(cls)
            with open(filename, "wt") as f:
                for im_ind, index in enumerate(self.ids):
                    index = index[1]
                    dets = all_boxes[cls_ind][im_ind]
                    # dets is either a plain [] or an (N, 5) ndarray; comparing
                    # an ndarray to [] with == is a deprecated elementwise
                    # comparison in NumPy, so test emptiness via len() instead.
                    if len(dets) == 0:
                        continue
                    for k in range(dets.shape[0]):
                        # VOC expects 1-based pixel coordinates.
                        f.write(
                            "{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n".format(
                                index,
                                dets[k, -1],
                                dets[k, 0] + 1,
                                dets[k, 1] + 1,
                                dets[k, 2] + 1,
                                dets[k, 3] + 1,
                            )
                        )

    def _do_python_eval(self, output_dir="output", iou=0.5):
        """Run voc_eval for every class at one IoU threshold; return the mAP."""
        rootpath = os.path.join(self.root, "VOC" + self._year)
        name = self.image_set[0][1]
        annopath = os.path.join(rootpath, "Annotations", "{:s}.xml")
        imagesetfile = os.path.join(rootpath, "ImageSets", "Main", name + ".txt")
        cachedir = os.path.join(
            self.root, "annotations_cache", "VOC" + self._year, name
        )
        if not os.path.exists(cachedir):
            os.makedirs(cachedir)
        aps = []
        # The PASCAL VOC metric changed in 2010
        use_07_metric = True if int(self._year) < 2010 else False
        print("Eval IoU : {:.2f}".format(iou))
        if output_dir is not None and not os.path.isdir(output_dir):
            os.mkdir(output_dir)
        for i, cls in enumerate(VOC_CLASSES):
            if cls == "__background__":
                continue

            filename = self._get_voc_results_file_template().format(cls)
            rec, prec, ap = voc_eval(
                filename,
                annopath,
                imagesetfile,
                cls,
                cachedir,
                ovthresh=iou,
                use_07_metric=use_07_metric,
            )
            aps += [ap]
            if iou == 0.5:
                print("AP for {} = {:.4f}".format(cls, ap))
            if output_dir is not None:
                with open(os.path.join(output_dir, cls + "_pr.pkl"), "wb") as f:
                    pickle.dump({"rec": rec, "prec": prec, "ap": ap}, f)
        if iou == 0.5:
            print("Mean AP = {:.4f}".format(np.mean(aps)))
            print("~~~~~~~~")
            print("Results:")
            for ap in aps:
                print("{:.3f}".format(ap))
            print("{:.3f}".format(np.mean(aps)))
            print("~~~~~~~~")
            print("")
            print("--------------------------------------------------------------")
            print("Results computed with the **unofficial** Python eval code.")
            print("Results should be very close to the official MATLAB eval code.")
            print("Recompute with `./tools/reval.py --matlab ...` for your paper.")
            print("-- Thanks, The Management")
            print("--------------------------------------------------------------")

        return np.mean(aps)
================================================
FILE: detector/YOLOX/yolox/data/datasets/voc_classes.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
# The 20 PASCAL VOC object categories, in alphabetical order.
# Note: a '__background__' entry (always index 0 in some detectors) is
# deliberately NOT part of this tuple.
VOC_CLASSES = (
    "aeroplane", "bicycle", "bird", "boat", "bottle",
    "bus", "car", "cat", "chair", "cow",
    "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor",
)
================================================
FILE: detector/YOLOX/yolox/data/samplers.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
import itertools
from typing import Optional
import torch
import torch.distributed as dist
from torch.utils.data.sampler import BatchSampler as torchBatchSampler
from torch.utils.data.sampler import Sampler
class YoloBatchSampler(torchBatchSampler):
    """
    Batch sampler that decorates the indices produced by an ordinary
    :class:`torch.utils.data.sampler.BatchSampler`.

    Each mini-batch consists of ``(input_dim, index, mosaic)`` tuples
    instead of plain indices, and ``input_dim`` is guaranteed to stay
    constant within a single mini-batch.
    """

    def __init__(self, *args, input_dimension=None, mosaic=True, **kwargs):
        super().__init__(*args, **kwargs)
        self.input_dim = input_dimension
        self.new_input_dim = None
        self.mosaic = mosaic

    def __iter__(self):
        self._apply_pending_dim()
        for batch in super().__iter__():
            yield [(self.input_dim, index, self.mosaic) for index in batch]
            # A new dimension may have been requested while the previous
            # batch was being consumed; pick it up before the next one.
            self._apply_pending_dim()

    def _apply_pending_dim(self):
        """Swap in a newly requested input dimension, if one is pending."""
        if self.new_input_dim is None:
            return
        self.input_dim = (self.new_input_dim[0], self.new_input_dim[1])
        self.new_input_dim = None
class InfiniteSampler(Sampler):
    """
    Produce an endless stream of dataset indices, sharded across workers.

    Conceptually the full stream is
    ``shuffle(range(size)) + shuffle(range(size)) + ...`` when shuffling,
    or ``range(size) + range(size) + ...`` otherwise; each worker consumes
    the slice ``indices[rank::world_size]`` so that all workers together
    cover the stream exactly once.
    """

    def __init__(
        self,
        size: int,
        shuffle: bool = True,
        seed: Optional[int] = 0,
        rank=0,
        world_size=1,
    ):
        """
        Args:
            size (int): total number of samples in the underlying dataset
            shuffle (bool): whether to shuffle the indices or not
            seed (int): initial seed of the shuffle; must be identical
                across all workers
        """
        self._size = size
        assert size > 0
        self._shuffle = shuffle
        self._seed = int(seed)
        # Prefer the live process-group info; fall back to the explicit
        # rank/world_size arguments when distributed is not initialised.
        if dist.is_available() and dist.is_initialized():
            self._rank = dist.get_rank()
            self._world_size = dist.get_world_size()
        else:
            self._rank = rank
            self._world_size = world_size

    def __iter__(self):
        yield from itertools.islice(
            self._infinite_indices(), self._rank, None, self._world_size
        )

    def _infinite_indices(self):
        gen = torch.Generator()
        gen.manual_seed(self._seed)
        while True:
            if self._shuffle:
                yield from torch.randperm(self._size, generator=gen)
            else:
                yield from torch.arange(self._size)

    def __len__(self):
        return self._size // self._world_size
================================================
FILE: detector/YOLOX/yolox/evaluators/__init__.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
from .coco_evaluator import COCOEvaluator
from .voc_evaluator import VOCEvaluator
================================================
FILE: detector/YOLOX/yolox/evaluators/coco_evaluator.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
import contextlib
import io
import itertools
import json
import tempfile
import time
from loguru import logger
from tqdm import tqdm
import torch
from yolox.utils import (
gather,
is_main_process,
postprocess,
synchronize,
time_synchronized,
xyxy2xywh
)
class COCOEvaluator:
    """
    COCO AP Evaluation class. All the data in the val2017 dataset are processed
    and evaluated by COCO API.
    """

    def __init__(
        self, dataloader, img_size, confthre, nmsthre, num_classes, testdev=False
    ):
        """
        Args:
            dataloader (Dataloader): evaluate dataloader.
            img_size (int): image size after preprocess. images are resized
                to squares whose shape is (img_size, img_size).
            confthre (float): confidence threshold ranging from 0 to 1, which
                is defined in the config file.
            nmsthre (float): IoU threshold of non-max supression ranging from 0 to 1.
            num_classes (int): number of detection classes fed to postprocess().
            testdev (bool): if True, dump predictions to a json for the COCO
                test-dev server instead of a temporary file.
        """
        self.dataloader = dataloader
        self.img_size = img_size
        self.confthre = confthre
        self.nmsthre = nmsthre
        self.num_classes = num_classes
        self.testdev = testdev

    def evaluate(
        self,
        model,
        distributed=False,
        half=False,
        trt_file=None,
        decoder=None,
        test_size=None,
    ):
        """
        COCO average precision (AP) Evaluation. Iterate inference on the test dataset
        and the results are evaluated by COCO API.

        NOTE: This function will change training mode to False, please save states if needed.

        Args:
            model : model to evaluate.

        Returns:
            ap50_95 (float) : COCO AP of IoU=50:95
            ap50 (float) : COCO AP of IoU=50
            summary (sr): summary info of evaluation.
        """
        # TODO half to amp_test
        tensor_type = torch.cuda.HalfTensor if half else torch.cuda.FloatTensor
        model = model.eval()
        if half:
            model = model.half()
        ids = []
        data_list = []
        # only the main process shows a progress bar; others iterate silently
        progress_bar = tqdm if is_main_process() else iter

        inference_time = 0
        nms_time = 0
        # the last (possibly partial) batch is excluded from timing stats
        # NOTE(review): this is 0 for a single-batch loader, which would make
        # the average-time division below divide by zero — confirm upstream.
        n_samples = len(self.dataloader) - 1

        if trt_file is not None:
            # swap the model for a TensorRT engine; the dummy forward pass
            # below runs the original model once before the swap
            from torch2trt import TRTModule

            model_trt = TRTModule()
            model_trt.load_state_dict(torch.load(trt_file))

            x = torch.ones(1, 3, test_size[0], test_size[1]).cuda()
            model(x)
            model = model_trt

        for cur_iter, (imgs, _, info_imgs, ids) in enumerate(
            progress_bar(self.dataloader)
        ):
            with torch.no_grad():
                imgs = imgs.type(tensor_type)

                # skip the the last iters since batchsize might be not enough for batch inference
                is_time_record = cur_iter < len(self.dataloader) - 1
                if is_time_record:
                    start = time.time()

                outputs = model(imgs)
                if decoder is not None:
                    outputs = decoder(outputs, dtype=outputs.type())

                if is_time_record:
                    # CUDA-synchronized timestamp so GPU work is fully counted
                    infer_end = time_synchronized()
                    inference_time += infer_end - start

                outputs = postprocess(
                    outputs, self.num_classes, self.confthre, self.nmsthre
                )
                if is_time_record:
                    nms_end = time_synchronized()
                    nms_time += nms_end - infer_end

            data_list.extend(self.convert_to_coco_format(outputs, info_imgs, ids))

        statistics = torch.cuda.FloatTensor([inference_time, nms_time, n_samples])
        if distributed:
            # collect every rank's detections on rank 0 and flatten them
            data_list = gather(data_list, dst=0)
            data_list = list(itertools.chain(*data_list))
            torch.distributed.reduce(statistics, dst=0)

        eval_results = self.evaluate_prediction(data_list, statistics)
        synchronize()
        return eval_results

    def convert_to_coco_format(self, outputs, info_imgs, ids):
        """Map per-image network outputs to COCO json detection records."""
        data_list = []
        for (output, img_h, img_w, img_id) in zip(
            outputs, info_imgs[0], info_imgs[1], ids
        ):
            if output is None:
                continue
            output = output.cpu()

            bboxes = output[:, 0:4]

            # preprocessing: resize
            # undo the letterbox scale so boxes are in original image pixels
            scale = min(
                self.img_size[0] / float(img_h), self.img_size[1] / float(img_w)
            )
            bboxes /= scale
            bboxes = xyxy2xywh(bboxes)

            cls = output[:, 6]
            # final score = objectness * per-class confidence
            scores = output[:, 4] * output[:, 5]
            for ind in range(bboxes.shape[0]):
                # translate contiguous class index back to the COCO category id
                label = self.dataloader.dataset.class_ids[int(cls[ind])]
                pred_data = {
                    "image_id": int(img_id),
                    "category_id": label,
                    "bbox": bboxes[ind].numpy().tolist(),
                    "score": scores[ind].numpy().item(),
                    "segmentation": [],
                }  # COCO json format
                data_list.append(pred_data)
        return data_list

    def evaluate_prediction(self, data_dict, statistics):
        """Run the COCO API evaluation on gathered detections (rank 0 only)."""
        if not is_main_process():
            return 0, 0, None

        logger.info("Evaluate in main process...")

        annType = ["segm", "bbox", "keypoints"]

        inference_time = statistics[0].item()
        nms_time = statistics[1].item()
        n_samples = statistics[2].item()

        a_infer_time = 1000 * inference_time / (n_samples * self.dataloader.batch_size)
        a_nms_time = 1000 * nms_time / (n_samples * self.dataloader.batch_size)

        time_info = ", ".join(
            [
                "Average {} time: {:.2f} ms".format(k, v)
                for k, v in zip(
                    ["forward", "NMS", "inference"],
                    [a_infer_time, a_nms_time, (a_infer_time + a_nms_time)],
                )
            ]
        )

        info = time_info + "\n"

        # Evaluate the Dt (detection) json comparing with the ground truth
        if len(data_dict) > 0:
            cocoGt = self.dataloader.dataset.coco
            # TODO: since pycocotools can't process dict in py36, write data to json file.
            if self.testdev:
                json.dump(data_dict, open("./yolox_testdev_2017.json", "w"))
                cocoDt = cocoGt.loadRes("./yolox_testdev_2017.json")
            else:
                _, tmp = tempfile.mkstemp()
                json.dump(data_dict, open(tmp, "w"))
                cocoDt = cocoGt.loadRes(tmp)
            # prefer the optimized C++ COCOeval; fall back to the pure-python one
            try:
                from yolox.layers import COCOeval_opt as COCOeval
            except ImportError:
                from .cocoeval_mr import COCOeval

                logger.warning("Use standard COCOeval.")

            cocoEval = COCOeval(cocoGt, cocoDt, annType[1])
            cocoEval.evaluate()
            cocoEval.accumulate()
            # capture summarize()'s printed table into the info string
            redirect_string = io.StringIO()
            with contextlib.redirect_stdout(redirect_string):
                cocoEval.summarize()
            info += redirect_string.getvalue()
            return cocoEval.stats[0], cocoEval.stats[1], info
        else:
            return 0, 0, info
================================================
FILE: detector/YOLOX/yolox/evaluators/voc_eval.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Code are based on
# https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/datasets/voc_eval.py
# Copyright (c) Bharath Hariharan.
# Copyright (c) Megvii, Inc. and its affiliates.
import os
import pickle
import xml.etree.ElementTree as ET
import numpy as np
def parse_rec(filename):
    """Parse a PASCAL VOC xml annotation file into a list of object dicts."""
    objects = []
    root = ET.parse(filename)
    for obj in root.findall("object"):
        bndbox = obj.find("bndbox")
        record = {
            "name": obj.find("name").text,
            "pose": obj.find("pose").text,
            "truncated": int(obj.find("truncated").text),
            "difficult": int(obj.find("difficult").text),
            "bbox": [
                int(bndbox.find(tag).text)
                for tag in ("xmin", "ymin", "xmax", "ymax")
            ],
        }
        objects.append(record)
    return objects
def voc_ap(rec, prec, use_07_metric=False):
    """ap = voc_ap(rec, prec, [use_07_metric])

    Compute the VOC average precision from recall/precision arrays.
    With ``use_07_metric`` the VOC2007 11-point interpolation is used;
    otherwise the exact area under the interpolated PR curve.
    """
    if use_07_metric:
        # VOC2007: average the max precision at 11 evenly spaced recall levels.
        ap = 0.0
        for threshold in np.arange(0.0, 1.1, 0.1):
            candidates = prec[rec >= threshold]
            ap += (np.max(candidates) if candidates.size > 0 else 0) / 11.0
        return ap

    # Append sentinels so the envelope and the integral are well defined.
    mrec = np.concatenate(([0.0], rec, [1.0]))
    mpre = np.concatenate(([0.0], prec, [0.0]))

    # Make precision monotonically non-increasing (the PR envelope).
    for i in range(mpre.size - 1, 0, -1):
        mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])

    # Integrate: sum (delta recall) * precision at each recall step.
    steps = np.where(mrec[1:] != mrec[:-1])[0]
    return np.sum((mrec[steps + 1] - mrec[steps]) * mpre[steps + 1])
def voc_eval(
    detpath,
    annopath,
    imagesetfile,
    classname,
    cachedir,
    ovthresh=0.5,
    use_07_metric=False,
):
    """PASCAL VOC evaluation (recall, precision, AP) for a single class.

    Args:
        detpath (str): template for the detection results file;
            ``detpath.format(classname)`` must yield the actual path.
        annopath (str): template for the annotation xmls;
            ``annopath.format(imagename)`` must yield the actual path.
        imagesetfile (str): text file listing one image name per line.
        classname (str): category name to evaluate.
        cachedir (str): directory where parsed annotations are cached (pickle).
        ovthresh (float): IoU threshold above which a detection may match a GT.
        use_07_metric (bool): use the VOC2007 11-point AP interpolation.

    Returns:
        tuple: (rec, prec, ap), or (0, 0, 0) when the detection file is empty.
    """
    # first load gt
    if not os.path.isdir(cachedir):
        os.mkdir(cachedir)
    cachefile = os.path.join(cachedir, "annots.pkl")
    # read list of images
    with open(imagesetfile, "r") as f:
        lines = f.readlines()
    imagenames = [x.strip() for x in lines]

    if not os.path.isfile(cachefile):
        # parse all annotation xmls once and cache the result
        recs = {}
        for i, imagename in enumerate(imagenames):
            recs[imagename] = parse_rec(annopath.format(imagename))
            if i % 100 == 0:
                print("Reading annotation for {:d}/{:d}".format(i + 1, len(imagenames)))
        # save
        print("Saving cached annotations to {:s}".format(cachefile))
        with open(cachefile, "wb") as f:
            pickle.dump(recs, f)
    else:
        # load cached annotations (NOTE: stale if the image set changed)
        with open(cachefile, "rb") as f:
            recs = pickle.load(f)

    # extract gt objects for this class
    class_recs = {}
    npos = 0
    for imagename in imagenames:
        R = [obj for obj in recs[imagename] if obj["name"] == classname]
        bbox = np.array([x["bbox"] for x in R])
        # np.bool was removed in NumPy 1.24; the builtin bool is the
        # documented, behaviorally identical replacement.
        difficult = np.array([x["difficult"] for x in R]).astype(bool)
        det = [False] * len(R)
        # objects flagged "difficult" do not count towards the positives
        npos = npos + sum(~difficult)
        class_recs[imagename] = {"bbox": bbox, "difficult": difficult, "det": det}

    # read dets
    detfile = detpath.format(classname)
    with open(detfile, "r") as f:
        lines = f.readlines()

    if len(lines) == 0:
        return 0, 0, 0

    # each line: "<image_id> <confidence> <x1> <y1> <x2> <y2>"
    splitlines = [x.strip().split(" ") for x in lines]
    image_ids = [x[0] for x in splitlines]
    confidence = np.array([float(x[1]) for x in splitlines])
    BB = np.array([[float(z) for z in x[2:]] for x in splitlines])

    # sort by confidence
    sorted_ind = np.argsort(-confidence)
    BB = BB[sorted_ind, :]
    image_ids = [image_ids[x] for x in sorted_ind]

    # go down dets and mark TPs and FPs
    nd = len(image_ids)
    tp = np.zeros(nd)
    fp = np.zeros(nd)

    for d in range(nd):
        R = class_recs[image_ids[d]]
        bb = BB[d, :].astype(float)
        ovmax = -np.inf
        BBGT = R["bbox"].astype(float)

        if BBGT.size > 0:
            # compute overlaps
            # intersection (+1 terms: VOC uses inclusive pixel coordinates)
            ixmin = np.maximum(BBGT[:, 0], bb[0])
            iymin = np.maximum(BBGT[:, 1], bb[1])
            ixmax = np.minimum(BBGT[:, 2], bb[2])
            iymax = np.minimum(BBGT[:, 3], bb[3])
            iw = np.maximum(ixmax - ixmin + 1.0, 0.0)
            ih = np.maximum(iymax - iymin + 1.0, 0.0)
            inters = iw * ih

            # union
            uni = (
                (bb[2] - bb[0] + 1.0) * (bb[3] - bb[1] + 1.0)
                + (BBGT[:, 2] - BBGT[:, 0] + 1.0) * (BBGT[:, 3] - BBGT[:, 1] + 1.0)
                - inters
            )

            overlaps = inters / uni
            ovmax = np.max(overlaps)
            jmax = np.argmax(overlaps)

        if ovmax > ovthresh:
            if not R["difficult"][jmax]:
                if not R["det"][jmax]:
                    # first match of this GT box: true positive
                    tp[d] = 1.0
                    R["det"][jmax] = 1
                else:
                    # GT box already claimed: duplicate detection
                    fp[d] = 1.0
        else:
            fp[d] = 1.0

    # compute precision recall
    fp = np.cumsum(fp)
    tp = np.cumsum(tp)
    rec = tp / float(npos)
    # avoid divide by zero in case the first detection matches a difficult
    # ground truth
    prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
    ap = voc_ap(rec, prec, use_07_metric)

    return rec, prec, ap
================================================
FILE: detector/YOLOX/yolox/evaluators/voc_evaluator.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) Megvii, Inc. and its affiliates.
import sys
import tempfile
import time
from collections import ChainMap
from loguru import logger
from tqdm import tqdm
import numpy as np
import torch
from yolox.utils import gather, is_main_process, postprocess, synchronize, time_synchronized
class VOCEvaluator:
    """
    VOC AP Evaluation class.
    """

    def __init__(
        self, dataloader, img_size, confthre, nmsthre, num_classes,
    ):
        """
        Args:
            dataloader (Dataloader): evaluate dataloader.
            img_size (int): image size after preprocess. images are resized
                to squares whose shape is (img_size, img_size).
            confthre (float): confidence threshold ranging from 0 to 1, which
                is defined in the config file.
            nmsthre (float): IoU threshold of non-max supression ranging from 0 to 1.
            num_classes (int): number of detection classes.
        """
        self.dataloader = dataloader
        self.img_size = img_size
        self.confthre = confthre
        self.nmsthre = nmsthre
        self.num_classes = num_classes
        self.num_images = len(dataloader.dataset)

    def evaluate(
        self, model, distributed=False, half=False, trt_file=None, decoder=None, test_size=None
    ):
        """
        VOC average precision (AP) Evaluation. Iterate inference on the test dataset
        and the results are evaluated by COCO API.

        NOTE: This function will change training mode to False, please save states if needed.

        Args:
            model : model to evaluate.

        Returns:
            ap50_95 (float) : COCO style AP of IoU=50:95
            ap50 (float) : VOC 2007 metric AP of IoU=50
            summary (sr): summary info of evaluation.
        """
        # TODO half to amp_test
        tensor_type = torch.cuda.HalfTensor if half else torch.cuda.FloatTensor
        model = model.eval()
        if half:
            model = model.half()
        ids = []
        data_list = {}
        # only the main process shows a progress bar; others iterate silently
        progress_bar = tqdm if is_main_process() else iter

        inference_time = 0
        nms_time = 0
        # the last (possibly partial) batch is excluded from timing stats
        n_samples = len(self.dataloader) - 1

        if trt_file is not None:
            # swap the model for a TensorRT engine; the dummy forward below
            # runs the original model once before the swap
            from torch2trt import TRTModule

            model_trt = TRTModule()
            model_trt.load_state_dict(torch.load(trt_file))

            x = torch.ones(1, 3, test_size[0], test_size[1]).cuda()
            model(x)
            model = model_trt

        for cur_iter, (imgs, _, info_imgs, ids) in enumerate(progress_bar(self.dataloader)):
            with torch.no_grad():
                imgs = imgs.type(tensor_type)

                # skip the the last iters since batchsize might be not enough for batch inference
                is_time_record = cur_iter < len(self.dataloader) - 1
                if is_time_record:
                    start = time.time()

                outputs = model(imgs)
                if decoder is not None:
                    outputs = decoder(outputs, dtype=outputs.type())

                if is_time_record:
                    # CUDA-synchronized timestamp so GPU work is fully counted
                    infer_end = time_synchronized()
                    inference_time += infer_end - start

                outputs = postprocess(
                    outputs, self.num_classes, self.confthre, self.nmsthre
                )
                if is_time_record:
                    nms_end = time_synchronized()
                    nms_time += nms_end - infer_end

            data_list.update(self.convert_to_voc_format(outputs, info_imgs, ids))

        statistics = torch.cuda.FloatTensor([inference_time, nms_time, n_samples])
        if distributed:
            # merge each rank's {image_id: prediction} dict on rank 0
            data_list = gather(data_list, dst=0)
            data_list = ChainMap(*data_list)
            torch.distributed.reduce(statistics, dst=0)

        eval_results = self.evaluate_prediction(data_list, statistics)
        synchronize()
        return eval_results

    def convert_to_voc_format(self, outputs, info_imgs, ids):
        """Map per-image outputs back to original image coordinates.

        Returns:
            dict: image id -> (bboxes, cls, scores); (None, None, None) for
            images without detections.
        """
        predictions = {}
        for (output, img_h, img_w, img_id) in zip(outputs, info_imgs[0], info_imgs[1], ids):
            if output is None:
                predictions[int(img_id)] = (None, None, None)
                continue
            output = output.cpu()

            bboxes = output[:, 0:4]

            # preprocessing: resize
            # undo the letterbox scale so boxes are in original image pixels
            scale = min(self.img_size[0] / float(img_h), self.img_size[1] / float(img_w))
            bboxes /= scale

            cls = output[:, 6]
            # final score = objectness * per-class confidence
            scores = output[:, 4] * output[:, 5]

            predictions[int(img_id)] = (bboxes, cls, scores)
        return predictions

    def evaluate_prediction(self, data_dict, statistics):
        """Build the per-class detection table and run VOC eval (rank 0 only)."""
        if not is_main_process():
            return 0, 0, None

        logger.info("Evaluate in main process...")

        inference_time = statistics[0].item()
        nms_time = statistics[1].item()
        n_samples = statistics[2].item()

        a_infer_time = 1000 * inference_time / (n_samples * self.dataloader.batch_size)
        a_nms_time = 1000 * nms_time / (n_samples * self.dataloader.batch_size)

        time_info = ", ".join(
            ["Average {} time: {:.2f} ms".format(k, v) for k, v in zip(
                ["forward", "NMS", "inference"],
                [a_infer_time, a_nms_time, (a_infer_time + a_nms_time)]
            )]
        )
        info = time_info + "\n"

        # all_boxes[class][image] = (N, 5) array of [x1, y1, x2, y2, score]
        all_boxes = [[[] for _ in range(self.num_images)] for _ in range(self.num_classes)]
        for img_num in range(self.num_images):
            bboxes, cls, scores = data_dict[img_num]
            if bboxes is None:
                for j in range(self.num_classes):
                    all_boxes[j][img_num] = np.empty([0, 5], dtype=np.float32)
                continue
            for j in range(self.num_classes):
                mask_c = cls == j
                if sum(mask_c) == 0:
                    all_boxes[j][img_num] = np.empty([0, 5], dtype=np.float32)
                    continue
                c_dets = torch.cat((bboxes, scores.unsqueeze(1)), dim=1)
                all_boxes[j][img_num] = c_dets[mask_c].numpy()

            sys.stdout.write(
                "im_eval: {:d}/{:d} \r".format(img_num + 1, self.num_images)
            )
            sys.stdout.flush()

        with tempfile.TemporaryDirectory() as tempdir:
            # NOTE(review): evaluate_detections returns (mAP@0.50:0.95, mAP@0.50);
            # the second variable's name "mAP70" looks mislabeled — confirm
            # before relying on it being AP at IoU 0.70.
            mAP50, mAP70 = self.dataloader.dataset.evaluate_detections(all_boxes, tempdir)
            return mAP50, mAP70, info
================================================
FILE: detector/YOLOX/yolox/exp/__init__.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
from .base_exp import BaseExp
from .build import get_exp
from .yolox_base import Exp
================================================
FILE: detector/YOLOX/yolox/exp/base_exp.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import ast
import pprint
from abc import ABCMeta, abstractmethod
from typing import Dict
from tabulate import tabulate
import torch
from torch.nn import Module
from ..utils import LRScheduler
class BaseExp(metaclass=ABCMeta):
    """Abstract description of a training experiment.

    Subclasses supply the model, data, optimizer, scheduler and evaluator;
    this base provides pretty-printing and command-line overrides.
    """

    def __init__(self):
        self.seed = None
        self.output_dir = "./YOLOX_outputs"
        self.print_interval = 100
        self.eval_interval = 10

    @abstractmethod
    def get_model(self) -> Module:
        pass

    @abstractmethod
    def get_data_loader(
        self, batch_size: int, is_distributed: bool
    ) -> Dict[str, torch.utils.data.DataLoader]:
        pass

    @abstractmethod
    def get_optimizer(self, batch_size: int) -> torch.optim.Optimizer:
        pass

    @abstractmethod
    def get_lr_scheduler(
        self, lr: float, iters_per_epoch: int, **kwargs
    ) -> LRScheduler:
        pass

    @abstractmethod
    def get_evaluator(self):
        pass

    @abstractmethod
    def eval(self, model, evaluator, weights):
        pass

    def __repr__(self):
        # Render all public attributes as a two-column table.
        rows = []
        for key, value in vars(self).items():
            if key.startswith("_"):
                continue
            rows.append((str(key), pprint.pformat(value)))
        return tabulate(rows, headers=["keys", "values"], tablefmt="fancy_grid")

    def merge(self, cfg_list):
        """Override attributes from a flat [key1, value1, key2, value2, ...] list."""
        assert len(cfg_list) % 2 == 0
        for key, value in zip(cfg_list[0::2], cfg_list[1::2]):
            # only update value with same key
            if not hasattr(self, key):
                continue
            current = getattr(self, key)
            target_type = type(current)
            # Coerce the incoming (usually string) value to the existing
            # attribute's type; fall back to literal_eval for containers.
            if current is not None and target_type != type(value):
                try:
                    value = target_type(value)
                except Exception:
                    value = ast.literal_eval(value)
            setattr(self, key, value)
================================================
FILE: detector/YOLOX/yolox/exp/build.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import importlib
import os
import sys
def get_exp_by_file(exp_file):
    """Load an experiment description file and return its ``Exp`` instance."""
    # Make the file's directory importable, then import it by module name.
    sys.path.append(os.path.dirname(exp_file))
    module_name = os.path.basename(exp_file).split(".")[0]
    return importlib.import_module(module_name).Exp()
def get_exp_by_name(exp_name):
    """Resolve a built-in experiment name to its description file and load it."""
    import yolox

    # exps/default sits two levels above the installed yolox package dir.
    repo_root = os.path.dirname(os.path.dirname(yolox.__file__))
    name_to_file = {
        "yolox-s": "yolox_s.py",
        "yolox-m": "yolox_m.py",
        "yolox-l": "yolox_l.py",
        "yolox-x": "yolox_x.py",
        "yolox-tiny": "yolox_tiny.py",
        "yolox-nano": "nano.py",
        "yolov3": "yolov3.py",
    }
    exp_path = os.path.join(repo_root, "exps", "default", name_to_file[exp_name])
    return get_exp_by_file(exp_path)
def get_exp(exp_file, exp_name):
    """
    get Exp object by file or name. If exp_file and exp_name
    are both provided, get Exp by exp_file.

    Args:
        exp_file (str): file path of experiment.
        exp_name (str): name of experiment. "yolo-s",
    """
    assert exp_file is not None or exp_name is not None, "plz provide exp file or exp name."
    # The file path takes precedence over the built-in name.
    if exp_file is None:
        return get_exp_by_name(exp_name)
    return get_exp_by_file(exp_file)
================================================
FILE: detector/YOLOX/yolox/exp/yolox_base.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import os
import random
import torch
import torch.distributed as dist
import torch.nn as nn
from .base_exp import BaseExp
class Exp(BaseExp):
    def __init__(self):
        """Default YOLOX-L experiment configuration (COCO, 640x640)."""
        super().__init__()

        # ---------------- model config ---------------- #
        self.num_classes = 80
        # depth/width multipliers scale the backbone and head.
        self.depth = 1.00
        self.width = 1.00

        # ---------------- dataloader config ---------------- #
        # set worker to 4 for shorter dataloader init time
        self.data_num_workers = 4
        self.input_size = (640, 640)
        # range (in multiples of 32 px) sampled by random_resize()
        self.random_size = (14, 26)
        self.train_ann = "instances_train2017.json"
        self.val_ann = "instances_val2017.json"

        # --------------- transform config ----------------- #
        self.degrees = 10.0
        self.translate = 0.1
        self.scale = (0.1, 2)
        self.mscale = (0.8, 1.6)
        self.shear = 2.0
        self.perspective = 0.0
        self.enable_mixup = True

        # -------------- training config --------------------- #
        self.warmup_epochs = 5
        self.max_epoch = 300
        self.warmup_lr = 0
        # effective lr = basic_lr_per_img * total batch size (see get_optimizer)
        self.basic_lr_per_img = 0.01 / 64.0
        self.scheduler = "yoloxwarmcos"
        # presumably the number of final epochs trained without mosaic/mixup
        # — confirm against the trainer that consumes it
        self.no_aug_epochs = 15
        self.min_lr_ratio = 0.05
        self.ema = True

        self.weight_decay = 5e-4
        self.momentum = 0.9
        self.print_interval = 10
        self.eval_interval = 10
        self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]

        # -----------------  testing config ------------------ #
        self.test_size = (640, 640)
        self.test_conf = 0.01
        self.nmsthre = 0.65
def get_model(self):
from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead
def init_yolo(M):
for m in M.modules():
if isinstance(m, nn.BatchNorm2d):
m.eps = 1e-3
m.momentum = 0.03
if getattr(self, "model", None) is None:
in_channels = [256, 512, 1024]
backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels)
head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels)
self.model = YOLOX(backbone, head)
self.model.apply(init_yolo)
self.model.head.initialize_biases(1e-2)
return self.model
    def get_data_loader(self, batch_size, is_distributed, no_aug=False):
        """Build the training dataloader (COCO with mosaic/mixup augmentation).

        Args:
            batch_size (int): global batch size; divided by the world size
                when running distributed.
            is_distributed (bool): whether torch.distributed is in use.
            no_aug (bool): disable mosaic augmentation.
        """
        from yolox.data import (
            COCODataset,
            TrainTransform,
            YoloBatchSampler,
            DataLoader,
            InfiniteSampler,
            MosaicDetection,
        )

        # Base dataset; means/std here are the standard ImageNet statistics.
        dataset = COCODataset(
            data_dir=None,
            json_file=self.train_ann,
            img_size=self.input_size,
            preproc=TrainTransform(
                rgb_means=(0.485, 0.456, 0.406),
                std=(0.229, 0.224, 0.225),
                max_labels=50,
            ),
        )

        # Mosaic/mixup wrapper; a mosaic image can contain more objects,
        # hence the larger max_labels budget.
        dataset = MosaicDetection(
            dataset,
            mosaic=not no_aug,
            img_size=self.input_size,
            preproc=TrainTransform(
                rgb_means=(0.485, 0.456, 0.406),
                std=(0.229, 0.224, 0.225),
                max_labels=120,
            ),
            degrees=self.degrees,
            translate=self.translate,
            scale=self.scale,
            shear=self.shear,
            perspective=self.perspective,
            enable_mixup=self.enable_mixup,
        )

        self.dataset = dataset

        if is_distributed:
            batch_size = batch_size // dist.get_world_size()

        # Infinite seeded sampler: deterministic reshuffle, no epoch boundary.
        sampler = InfiniteSampler(
            len(self.dataset), seed=self.seed if self.seed else 0
        )

        batch_sampler = YoloBatchSampler(
            sampler=sampler,
            batch_size=batch_size,
            drop_last=False,
            input_dimension=self.input_size,
            mosaic=not no_aug,
        )

        dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True}
        dataloader_kwargs["batch_sampler"] = batch_sampler
        train_loader = DataLoader(self.dataset, **dataloader_kwargs)

        return train_loader
    def random_resize(self, data_loader, epoch, rank, is_distributed):
        """Randomly pick a new multiple-of-32 input size and apply it.

        Only rank 0 draws the size; the result is broadcast so every worker
        trains at the same resolution.
        """
        tensor = torch.LongTensor(2).cuda()

        if rank == 0:
            # keep the original aspect ratio of input_size
            size_factor = self.input_size[1] * 1. / self.input_size[0]
            # size is drawn in units of 32 pixels from self.random_size
            size = random.randint(*self.random_size)
            size = (int(32 * size), 32 * int(size * size_factor))
            tensor[0] = size[0]
            tensor[1] = size[1]

        if is_distributed:
            dist.barrier()
            dist.broadcast(tensor, 0)

        input_size = data_loader.change_input_dim(
            multiple=(tensor[0].item(), tensor[1].item()), random_range=None
        )
        return input_size
def get_optimizer(self, batch_size):
    """Lazily build and cache an SGD optimizer with three parameter groups.

    Group 0: BatchNorm weights (no weight decay); group 1: other weights
    (decayed); group 2: all biases (no decay). The first call constructs the
    optimizer; later calls return the cached instance unchanged.
    """
    if "optimizer" not in self.__dict__:
        # During warmup the LR starts from warmup_lr; otherwise it is scaled
        # linearly with the batch size.
        lr = self.warmup_lr if self.warmup_epochs > 0 else self.basic_lr_per_img * batch_size

        bn_weights, decayed_weights, biases = [], [], []
        for name, module in self.model.named_modules():
            if hasattr(module, "bias") and isinstance(module.bias, nn.Parameter):
                biases.append(module.bias)
            if isinstance(module, nn.BatchNorm2d) or "bn" in name:
                bn_weights.append(module.weight)
            elif hasattr(module, "weight") and isinstance(module.weight, nn.Parameter):
                decayed_weights.append(module.weight)

        sgd = torch.optim.SGD(
            bn_weights, lr=lr, momentum=self.momentum, nesterov=True
        )
        sgd.add_param_group(
            {"params": decayed_weights, "weight_decay": self.weight_decay}
        )
        sgd.add_param_group({"params": biases})
        self.optimizer = sgd
    return self.optimizer
def get_lr_scheduler(self, lr, iters_per_epoch):
    """Create the learning-rate scheduler configured by this experiment."""
    from yolox.utils import LRScheduler

    warmup_and_decay = dict(
        warmup_epochs=self.warmup_epochs,
        warmup_lr_start=self.warmup_lr,
        no_aug_epochs=self.no_aug_epochs,
        min_lr_ratio=self.min_lr_ratio,
    )
    return LRScheduler(
        self.scheduler, lr, iters_per_epoch, self.max_epoch, **warmup_and_decay
    )
def get_eval_loader(self, batch_size, is_distributed, testdev=False):
    """Build the COCO validation (or test-dev) dataloader.

    Sampling is sequential; under DDP a DistributedSampler (no shuffle)
    partitions the dataset and the batch size is divided by world size.
    """
    from yolox.data import COCODataset, ValTransform

    if testdev:
        ann_file, split_name = "image_info_test-dev2017.json", "test2017"
    else:
        ann_file, split_name = self.val_ann, "val2017"

    valdataset = COCODataset(
        data_dir=None,
        json_file=ann_file,
        name=split_name,
        img_size=self.test_size,
        preproc=ValTransform(
            rgb_means=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)
        ),
    )

    if is_distributed:
        batch_size = batch_size // dist.get_world_size()
        sampler = torch.utils.data.distributed.DistributedSampler(
            valdataset, shuffle=False
        )
    else:
        sampler = torch.utils.data.SequentialSampler(valdataset)

    return torch.utils.data.DataLoader(
        valdataset,
        batch_size=batch_size,
        num_workers=self.data_num_workers,
        pin_memory=True,
        sampler=sampler,
    )
def get_evaluator(self, batch_size, is_distributed, testdev=False):
    """Create a COCOEvaluator wired to this experiment's eval dataloader."""
    from yolox.evaluators import COCOEvaluator

    loader = self.get_eval_loader(batch_size, is_distributed, testdev=testdev)
    return COCOEvaluator(
        dataloader=loader,
        img_size=self.test_size,
        confthre=self.test_conf,
        nmsthre=self.nmsthre,
        num_classes=self.num_classes,
        testdev=testdev,
    )
def eval(self, model, evaluator, is_distributed, half=False):
    """Delegate evaluation of `model` to the provided evaluator object."""
    result = evaluator.evaluate(model, is_distributed, half)
    return result
================================================
FILE: detector/YOLOX/yolox/layers/__init__.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
from .fast_coco_eval_api import COCOeval_opt
================================================
FILE: detector/YOLOX/yolox/layers/csrc/cocoeval/cocoeval.cpp
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include "cocoeval.h"

#include <time.h>

#include <algorithm>
#include <cstdint>
#include <numeric>
using namespace pybind11::literals;
namespace COCOeval {
// Sort detections from highest score to lowest, such that
// detection_instances[detection_sorted_indices[t]] >=
// detection_instances[detection_sorted_indices[t+1]]. Use stable_sort to match
// original COCO API
void SortInstancesByDetectionScore(
const std::vector& detection_instances,
std::vector* detection_sorted_indices) {
detection_sorted_indices->resize(detection_instances.size());
std::iota(
detection_sorted_indices->begin(), detection_sorted_indices->end(), 0);
std::stable_sort(
detection_sorted_indices->begin(),
detection_sorted_indices->end(),
[&detection_instances](size_t j1, size_t j2) {
return detection_instances[j1].score > detection_instances[j2].score;
});
}
// Partition the ground truth objects based on whether or not to ignore them
// based on area
void SortInstancesByIgnore(
const std::array& area_range,
const std::vector& ground_truth_instances,
std::vector* ground_truth_sorted_indices,
std::vector* ignores) {
ignores->clear();
ignores->reserve(ground_truth_instances.size());
for (auto o : ground_truth_instances) {
ignores->push_back(
o.ignore || o.area < area_range[0] || o.area > area_range[1]);
}
ground_truth_sorted_indices->resize(ground_truth_instances.size());
std::iota(
ground_truth_sorted_indices->begin(),
ground_truth_sorted_indices->end(),
0);
std::stable_sort(
ground_truth_sorted_indices->begin(),
ground_truth_sorted_indices->end(),
[&ignores](size_t j1, size_t j2) {
return (int)(*ignores)[j1] < (int)(*ignores)[j2];
});
}
// For each IOU threshold, greedily match each detected instance to a ground
// truth instance (if possible) and store the results
void MatchDetectionsToGroundTruth(
const std::vector& detection_instances,
const std::vector& detection_sorted_indices,
const std::vector& ground_truth_instances,
const std::vector& ground_truth_sorted_indices,
const std::vector& ignores,
const std::vector>& ious,
const std::vector& iou_thresholds,
const std::array& area_range,
ImageEvaluation* results) {
// Initialize memory to store return data matches and ignore
const int num_iou_thresholds = iou_thresholds.size();
const int num_ground_truth = ground_truth_sorted_indices.size();
const int num_detections = detection_sorted_indices.size();
std::vector ground_truth_matches(
num_iou_thresholds * num_ground_truth, 0);
std::vector& detection_matches = results->detection_matches;
std::vector& detection_ignores = results->detection_ignores;
std::vector& ground_truth_ignores = results->ground_truth_ignores;
detection_matches.resize(num_iou_thresholds * num_detections, 0);
detection_ignores.resize(num_iou_thresholds * num_detections, false);
ground_truth_ignores.resize(num_ground_truth);
for (auto g = 0; g < num_ground_truth; ++g) {
ground_truth_ignores[g] = ignores[ground_truth_sorted_indices[g]];
}
for (auto t = 0; t < num_iou_thresholds; ++t) {
for (auto d = 0; d < num_detections; ++d) {
// information about best match so far (match=-1 -> unmatched)
double best_iou = std::min(iou_thresholds[t], 1 - 1e-10);
int match = -1;
for (auto g = 0; g < num_ground_truth; ++g) {
// if this ground truth instance is already matched and not a
// crowd, it cannot be matched to another detection
if (ground_truth_matches[t * num_ground_truth + g] > 0 &&
!ground_truth_instances[ground_truth_sorted_indices[g]].is_crowd) {
continue;
}
// if detected instance matched to a regular ground truth
// instance, we can break on the first ground truth instance
// tagged as ignore (because they are sorted by the ignore tag)
if (match >= 0 && !ground_truth_ignores[match] &&
ground_truth_ignores[g]) {
break;
}
// if IOU overlap is the best so far, store the match appropriately
if (ious[d][ground_truth_sorted_indices[g]] >= best_iou) {
best_iou = ious[d][ground_truth_sorted_indices[g]];
match = g;
}
}
// if match was made, store id of match for both detection and
// ground truth
if (match >= 0) {
detection_ignores[t * num_detections + d] = ground_truth_ignores[match];
detection_matches[t * num_detections + d] =
ground_truth_instances[ground_truth_sorted_indices[match]].id;
ground_truth_matches[t * num_ground_truth + match] =
detection_instances[detection_sorted_indices[d]].id;
}
// set unmatched detections outside of area range to ignore
const InstanceAnnotation& detection =
detection_instances[detection_sorted_indices[d]];
detection_ignores[t * num_detections + d] =
detection_ignores[t * num_detections + d] ||
(detection_matches[t * num_detections + d] == 0 &&
(detection.area < area_range[0] || detection.area > area_range[1]));
}
}
// store detection score results
results->detection_scores.resize(detection_sorted_indices.size());
for (size_t d = 0; d < detection_sorted_indices.size(); ++d) {
results->detection_scores[d] =
detection_instances[detection_sorted_indices[d]].score;
}
}
std::vector EvaluateImages(
const std::vector>& area_ranges,
int max_detections,
const std::vector& iou_thresholds,
const ImageCategoryInstances>& image_category_ious,
const ImageCategoryInstances&
image_category_ground_truth_instances,
const ImageCategoryInstances&
image_category_detection_instances) {
const int num_area_ranges = area_ranges.size();
const int num_images = image_category_ground_truth_instances.size();
const int num_categories =
image_category_ious.size() > 0 ? image_category_ious[0].size() : 0;
std::vector detection_sorted_indices;
std::vector ground_truth_sorted_indices;
std::vector ignores;
std::vector results_all(
num_images * num_area_ranges * num_categories);
// Store results for each image, category, and area range combination. Results
// for each IOU threshold are packed into the same ImageEvaluation object
for (auto i = 0; i < num_images; ++i) {
for (auto c = 0; c < num_categories; ++c) {
const std::vector& ground_truth_instances =
image_category_ground_truth_instances[i][c];
const std::vector& detection_instances =
image_category_detection_instances[i][c];
SortInstancesByDetectionScore(
detection_instances, &detection_sorted_indices);
if ((int)detection_sorted_indices.size() > max_detections) {
detection_sorted_indices.resize(max_detections);
}
for (size_t a = 0; a < area_ranges.size(); ++a) {
SortInstancesByIgnore(
area_ranges[a],
ground_truth_instances,
&ground_truth_sorted_indices,
&ignores);
MatchDetectionsToGroundTruth(
detection_instances,
detection_sorted_indices,
ground_truth_instances,
ground_truth_sorted_indices,
ignores,
image_category_ious[i][c],
iou_thresholds,
area_ranges[a],
&results_all
[c * num_area_ranges * num_images + a * num_images + i]);
}
}
}
return results_all;
}
// Convert a python list to a vector
template
std::vector list_to_vec(const py::list& l) {
std::vector v(py::len(l));
for (int i = 0; i < (int)py::len(l); ++i) {
v[i] = l[i].cast();
}
return v;
}
// Helper function to Accumulate()
// Considers the evaluation results applicable to a particular category, area
// range, and max_detections parameter setting, which begin at
// evaluations[evaluation_index]. Extracts a sorted list of length n of all
// applicable detection instances concatenated across all images in the dataset,
// which are represented by the outputs evaluation_indices, detection_scores,
// image_detection_indices, and detection_sorted_indices--all of which are
// length n. evaluation_indices[i] stores the applicable index into
// evaluations[] for instance i, which has detection score detection_score[i],
// and is the image_detection_indices[i]'th of the list of detections
// for the image containing i. detection_sorted_indices[] defines a sorted
// permutation of the 3 other outputs
int BuildSortedDetectionList(
const std::vector& evaluations,
const int64_t evaluation_index,
const int64_t num_images,
const int max_detections,
std::vector* evaluation_indices,
std::vector* detection_scores,
std::vector* detection_sorted_indices,
std::vector* image_detection_indices) {
assert(evaluations.size() >= evaluation_index + num_images);
// Extract a list of object instances of the applicable category, area
// range, and max detections requirements such that they can be sorted
image_detection_indices->clear();
evaluation_indices->clear();
detection_scores->clear();
image_detection_indices->reserve(num_images * max_detections);
evaluation_indices->reserve(num_images * max_detections);
detection_scores->reserve(num_images * max_detections);
int num_valid_ground_truth = 0;
for (auto i = 0; i < num_images; ++i) {
const ImageEvaluation& evaluation = evaluations[evaluation_index + i];
for (int d = 0;
d < (int)evaluation.detection_scores.size() && d < max_detections;
++d) { // detected instances
evaluation_indices->push_back(evaluation_index + i);
image_detection_indices->push_back(d);
detection_scores->push_back(evaluation.detection_scores[d]);
}
for (auto ground_truth_ignore : evaluation.ground_truth_ignores) {
if (!ground_truth_ignore) {
++num_valid_ground_truth;
}
}
}
// Sort detections by decreasing score, using stable sort to match
// python implementation
detection_sorted_indices->resize(detection_scores->size());
std::iota(
detection_sorted_indices->begin(), detection_sorted_indices->end(), 0);
std::stable_sort(
detection_sorted_indices->begin(),
detection_sorted_indices->end(),
[&detection_scores](size_t j1, size_t j2) {
return (*detection_scores)[j1] > (*detection_scores)[j2];
});
return num_valid_ground_truth;
}
// Helper function to Accumulate()
// Compute a precision recall curve given a sorted list of detected instances
// encoded in evaluations, evaluation_indices, detection_scores,
// detection_sorted_indices, image_detection_indices (see
// BuildSortedDetectionList()). Using vectors precisions and recalls
// and temporary storage, output the results into precisions_out, recalls_out,
// and scores_out, which are large buffers containing many precion/recall curves
// for all possible parameter settings, with precisions_out_index and
// recalls_out_index defining the applicable indices to store results.
void ComputePrecisionRecallCurve(
const int64_t precisions_out_index,
const int64_t precisions_out_stride,
const int64_t recalls_out_index,
const std::vector& recall_thresholds,
const int iou_threshold_index,
const int num_iou_thresholds,
const int num_valid_ground_truth,
const std::vector& evaluations,
const std::vector& evaluation_indices,
const std::vector& detection_scores,
const std::vector& detection_sorted_indices,
const std::vector& image_detection_indices,
std::vector* precisions,
std::vector* recalls,
std::vector* precisions_out,
std::vector* scores_out,
std::vector* recalls_out) {
assert(recalls_out->size() > recalls_out_index);
// Compute precision/recall for each instance in the sorted list of detections
int64_t true_positives_sum = 0, false_positives_sum = 0;
precisions->clear();
recalls->clear();
precisions->reserve(detection_sorted_indices.size());
recalls->reserve(detection_sorted_indices.size());
assert(!evaluations.empty() || detection_sorted_indices.empty());
for (auto detection_sorted_index : detection_sorted_indices) {
const ImageEvaluation& evaluation =
evaluations[evaluation_indices[detection_sorted_index]];
const auto num_detections =
evaluation.detection_matches.size() / num_iou_thresholds;
const auto detection_index = iou_threshold_index * num_detections +
image_detection_indices[detection_sorted_index];
assert(evaluation.detection_matches.size() > detection_index);
assert(evaluation.detection_ignores.size() > detection_index);
const int64_t detection_match =
evaluation.detection_matches[detection_index];
const bool detection_ignores =
evaluation.detection_ignores[detection_index];
const auto true_positive = detection_match > 0 && !detection_ignores;
const auto false_positive = detection_match == 0 && !detection_ignores;
if (true_positive) {
++true_positives_sum;
}
if (false_positive) {
++false_positives_sum;
}
const double recall =
static_cast(true_positives_sum) / num_valid_ground_truth;
recalls->push_back(recall);
const int64_t num_valid_detections =
true_positives_sum + false_positives_sum;
const double precision = num_valid_detections > 0
? static_cast(true_positives_sum) / num_valid_detections
: 0.0;
precisions->push_back(precision);
}
(*recalls_out)[recalls_out_index] = !recalls->empty() ? recalls->back() : 0;
for (int64_t i = static_cast(precisions->size()) - 1; i > 0; --i) {
if ((*precisions)[i] > (*precisions)[i - 1]) {
(*precisions)[i - 1] = (*precisions)[i];
}
}
// Sample the per instance precision/recall list at each recall threshold
for (size_t r = 0; r < recall_thresholds.size(); ++r) {
// first index in recalls >= recall_thresholds[r]
std::vector::iterator low = std::lower_bound(
recalls->begin(), recalls->end(), recall_thresholds[r]);
size_t precisions_index = low - recalls->begin();
const auto results_ind = precisions_out_index + r * precisions_out_stride;
assert(results_ind < precisions_out->size());
assert(results_ind < scores_out->size());
if (precisions_index < precisions->size()) {
(*precisions_out)[results_ind] = (*precisions)[precisions_index];
(*scores_out)[results_ind] =
detection_scores[detection_sorted_indices[precisions_index]];
} else {
(*precisions_out)[results_ind] = 0;
(*scores_out)[results_ind] = 0;
}
}
}
// C++ implementation of COCOeval.accumulate(): builds the flattened
// precision / recall / score tensors for every combination of IOU threshold,
// recall threshold, category, area range, and max-detections setting.
// (Restored template arguments stripped during extraction, and fixed the
// strftime format string — upstream contained "%num_max_detections" where
// the minutes specifier "%M" belongs, producing a garbled date.)
py::dict Accumulate(
    const py::object& params,
    const std::vector<ImageEvaluation>& evaluations) {
  const std::vector<double> recall_thresholds =
      list_to_vec<double>(params.attr("recThrs"));
  const std::vector<int> max_detections =
      list_to_vec<int>(params.attr("maxDets"));
  const int num_iou_thresholds = py::len(params.attr("iouThrs"));
  const int num_recall_thresholds = py::len(params.attr("recThrs"));
  const int num_categories = params.attr("useCats").cast<int>() == 1
      ? py::len(params.attr("catIds"))
      : 1;
  const int num_area_ranges = py::len(params.attr("areaRng"));
  const int num_max_detections = py::len(params.attr("maxDets"));
  const int num_images = py::len(params.attr("imgIds"));

  // Output buffers, initialized to -1 ("no result") like the Python API.
  std::vector<double> precisions_out(
      num_iou_thresholds * num_recall_thresholds * num_categories *
          num_area_ranges * num_max_detections,
      -1);
  std::vector<double> recalls_out(
      num_iou_thresholds * num_categories * num_area_ranges *
          num_max_detections,
      -1);
  std::vector<double> scores_out(
      num_iou_thresholds * num_recall_thresholds * num_categories *
          num_area_ranges * num_max_detections,
      -1);

  // Consider the list of all detected instances in the entire dataset in one
  // large list. evaluation_indices, detection_scores,
  // image_detection_indices, and detection_sorted_indices all have the same
  // length as this list, such that each entry corresponds to one detected
  // instance
  std::vector<uint64_t> evaluation_indices; // indices into evaluations[]
  std::vector<double> detection_scores; // detection scores of each instance
  std::vector<uint64_t> detection_sorted_indices; // sorted indices of all
                                                  // instances in the dataset
  std::vector<uint64_t>
      image_detection_indices; // indices into the list of detected instances in
                               // the same image as each instance
  std::vector<double> precisions, recalls;

  for (auto c = 0; c < num_categories; ++c) {
    for (auto a = 0; a < num_area_ranges; ++a) {
      for (auto m = 0; m < num_max_detections; ++m) {
        // The COCO PythonAPI assumes evaluations[] (the return value of
        // COCOeval::EvaluateImages() is one long list storing results for each
        // combination of category, area range, and image id, with categories in
        // the outermost loop and images in the innermost loop.
        const int64_t evaluations_index =
            c * num_area_ranges * num_images + a * num_images;
        int num_valid_ground_truth = BuildSortedDetectionList(
            evaluations,
            evaluations_index,
            num_images,
            max_detections[m],
            &evaluation_indices,
            &detection_scores,
            &detection_sorted_indices,
            &image_detection_indices);

        if (num_valid_ground_truth == 0) {
          continue;
        }

        for (auto t = 0; t < num_iou_thresholds; ++t) {
          // recalls_out is a flattened vectors representing a
          // num_iou_thresholds X num_categories X num_area_ranges X
          // num_max_detections matrix
          const int64_t recalls_out_index =
              t * num_categories * num_area_ranges * num_max_detections +
              c * num_area_ranges * num_max_detections +
              a * num_max_detections + m;

          // precisions_out and scores_out are flattened vectors
          // representing a num_iou_thresholds X num_recall_thresholds X
          // num_categories X num_area_ranges X num_max_detections matrix
          const int64_t precisions_out_stride =
              num_categories * num_area_ranges * num_max_detections;
          const int64_t precisions_out_index = t * num_recall_thresholds *
                  num_categories * num_area_ranges * num_max_detections +
              c * num_area_ranges * num_max_detections +
              a * num_max_detections + m;

          ComputePrecisionRecallCurve(
              precisions_out_index,
              precisions_out_stride,
              recalls_out_index,
              recall_thresholds,
              t,
              num_iou_thresholds,
              num_valid_ground_truth,
              evaluations,
              evaluation_indices,
              detection_scores,
              detection_sorted_indices,
              image_detection_indices,
              &precisions,
              &recalls,
              &precisions_out,
              &scores_out,
              &recalls_out);
        }
      }
    }
  }

  time_t rawtime;
  struct tm local_time;
  std::array<char, 200> buffer;
  time(&rawtime);
#ifdef _WIN32
  localtime_s(&local_time, &rawtime);
#else
  localtime_r(&rawtime, &local_time);
#endif
  // BUGFIX: was "%Y-%m-%d %H:%num_max_detections:%S" (bad rename of "%M").
  strftime(buffer.data(), 200, "%Y-%m-%d %H:%M:%S", &local_time);

  return py::dict(
      "params"_a = params,
      "counts"_a = std::vector<int64_t>({num_iou_thresholds,
                                         num_recall_thresholds,
                                         num_categories,
                                         num_area_ranges,
                                         num_max_detections}),
      "date"_a = buffer,
      "precision"_a = precisions_out,
      "recall"_a = recalls_out,
      "scores"_a = scores_out);
}
} // namespace COCOeval
================================================
FILE: detector/YOLOX/yolox/layers/csrc/cocoeval/cocoeval.h
================================================
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#pragma once
#include
#include
#include
#include
#include
namespace py = pybind11;
namespace COCOeval {
// Annotation data for a single object instance in an image
// Annotation data for a single object instance (ground truth or detection).
struct InstanceAnnotation {
  InstanceAnnotation(
      uint64_t id,
      double score,
      double area,
      bool is_crowd,
      bool ignore)
      : id{id}, score{score}, area{area}, is_crowd{is_crowd}, ignore{ignore} {}
  uint64_t id; // COCO annotation id (0 is reserved for "unmatched")
  double score = 0.; // detection confidence; unused for ground truth
  double area = 0.; // instance area, compared against area_range filters
  bool is_crowd = false; // crowd ground truth can match multiple detections
  bool ignore = false; // excluded from evaluation when true
};
// Stores intermediate results for evaluating detection results for a single
// image that has D detected instances and G ground truth instances. This stores
// matches between detected and ground truth instances
// Stores intermediate results for evaluating detection results for a single
// image that has D detected instances and G ground truth instances. This
// stores matches between detected and ground truth instances.
// (Restored element types that were stripped during extraction.)
struct ImageEvaluation {
  // For each of the D detected instances, the id of the matched ground truth
  // instance, or 0 if unmatched
  std::vector<uint64_t> detection_matches;

  // The detection score of each of the D detected instances
  std::vector<double> detection_scores;

  // Marks whether or not each of G instances was ignored from evaluation (e.g.,
  // because it's outside area_range)
  std::vector<bool> ground_truth_ignores;

  // Marks whether or not each of D instances was ignored from evaluation (e.g.,
  // because it's outside aRng)
  std::vector<bool> detection_ignores;
};
// Nested per-image, per-category container: outer index is image, middle is
// category, inner is instance. (Restored template syntax stripped during
// extraction.)
template <class T>
using ImageCategoryInstances = std::vector<std::vector<std::vector<T>>>;
// C++ implementation of COCO API cocoeval.py::COCOeval.evaluateImg(). For each
// combination of image, category, area range settings, and IOU thresholds to
// evaluate, it matches detected instances to ground truth instances and stores
// the results into a vector of ImageEvaluation results, which will be
// interpreted by the COCOeval::Accumulate() function to produce precion-recall
// curves. The parameters of nested vectors have the following semantics:
// image_category_ious[i][c][d][g] is the intersection over union of the d'th
// detected instance and g'th ground truth instance of
// category category_ids[c] in image image_ids[i]
// image_category_ground_truth_instances[i][c] is a vector of ground truth
// instances in image image_ids[i] of category category_ids[c]
// image_category_detection_instances[i][c] is a vector of detected
// instances in image image_ids[i] of category category_ids[c]
std::vector EvaluateImages(
const std::vector>& area_ranges, // vector of 2-tuples
int max_detections,
const std::vector& iou_thresholds,
const ImageCategoryInstances>& image_category_ious,
const ImageCategoryInstances&
image_category_ground_truth_instances,
const ImageCategoryInstances&
image_category_detection_instances);
// C++ implementation of COCOeval.accumulate(), which generates precision
// recall curves for each set of category, IOU threshold, detection area range,
// and max number of detections parameters. It is assumed that the parameter
// evaluations is the return value of the functon COCOeval::EvaluateImages(),
// which was called with the same parameter settings params
// Declaration of COCOeval::Accumulate (defined in cocoeval.cpp).
// (Restored template arguments that were stripped during extraction.)
py::dict Accumulate(
    const py::object& params,
    const std::vector<ImageEvaluation>& evalutations);
} // namespace COCOeval
================================================
FILE: detector/YOLOX/yolox/layers/csrc/vision.cpp
================================================
#include "cocoeval/cocoeval.h"
// Python bindings for the fast COCO evaluation routines.
// (Restored the class_<...> and init<...> template arguments that were
// stripped during extraction; init's argument list mirrors
// InstanceAnnotation's constructor.)
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("COCOevalAccumulate", &COCOeval::Accumulate, "COCOeval::Accumulate");
  m.def(
      "COCOevalEvaluateImages",
      &COCOeval::EvaluateImages,
      "COCOeval::EvaluateImages");
  pybind11::class_<COCOeval::InstanceAnnotation>(m, "InstanceAnnotation")
      .def(pybind11::init<uint64_t, double, double, bool, bool>());
  pybind11::class_<COCOeval::ImageEvaluation>(m, "ImageEvaluation")
      .def(pybind11::init<>());
}
================================================
FILE: detector/YOLOX/yolox/layers/fast_coco_eval_api.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# This file comes from
# https://github.com/facebookresearch/detectron2/blob/master/detectron2/evaluation/fast_eval_api.py
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import copy
import time
import numpy as np
from pycocotools.cocoeval import COCOeval
# import torch first to make yolox._C work without ImportError of libc10.so
# in YOLOX, env is already set in __init__.py.
from yolox import _C
class COCOeval_opt(COCOeval):
    """
    This is a slightly modified version of the original COCO API, where the functions evaluateImg()
    and accumulate() are implemented in C++ to speedup evaluation
    """

    def evaluate(self):
        """
        Run per image evaluation on given images and store results in self.evalImgs_cpp, a
        datastructure that isn't readable from Python but is used by a c++ implementation of
        accumulate(). Unlike the original COCO PythonAPI, we don't populate the datastructure
        self.evalImgs because this datastructure is a computational bottleneck.
        :return: None
        """
        tic = time.time()
        print("Running per image evaluation...")
        p = self.params
        # add backward compatibility if useSegm is specified in params
        if p.useSegm is not None:
            p.iouType = "segm" if p.useSegm == 1 else "bbox"
            print(
                "useSegm (deprecated) is not None. Running {} evaluation".format(p.iouType)
            )
        print("Evaluate annotation type *{}*".format(p.iouType))
        # Deduplicate ids so each (image, category) pair is evaluated once.
        p.imgIds = list(np.unique(p.imgIds))
        if p.useCats:
            p.catIds = list(np.unique(p.catIds))
        p.maxDets = sorted(p.maxDets)
        self.params = p

        self._prepare()

        # loop through images, area range, max detection number
        catIds = p.catIds if p.useCats else [-1]

        if p.iouType == "segm" or p.iouType == "bbox":
            computeIoU = self.computeIoU
        elif p.iouType == "keypoints":
            computeIoU = self.computeOks
        # NOTE(review): any other iouType leaves computeIoU unbound and raises
        # NameError below — same behavior as the upstream implementation.
        self.ious = {
            (imgId, catId): computeIoU(imgId, catId)
            for imgId in p.imgIds
            for catId in catIds
        }

        maxDet = p.maxDets[-1]

        # <<<< Beginning of code differences with original COCO API
        def convert_instances_to_cpp(instances, is_det=False):
            # Convert annotations for a list of instances in an image to a format that's fast
            # to access in C++
            instances_cpp = []
            for instance in instances:
                instance_cpp = _C.InstanceAnnotation(
                    int(instance["id"]),
                    instance["score"] if is_det else instance.get("score", 0.0),
                    instance["area"],
                    bool(instance.get("iscrowd", 0)),
                    bool(instance.get("ignore", 0)),
                )
                instances_cpp.append(instance_cpp)
            return instances_cpp

        # Convert GT annotations, detections, and IOUs to a format that's fast to access in C++
        ground_truth_instances = [
            [convert_instances_to_cpp(self._gts[imgId, catId]) for catId in p.catIds]
            for imgId in p.imgIds
        ]
        detected_instances = [
            [
                convert_instances_to_cpp(self._dts[imgId, catId], is_det=True)
                for catId in p.catIds
            ]
            for imgId in p.imgIds
        ]
        ious = [[self.ious[imgId, catId] for catId in catIds] for imgId in p.imgIds]

        if not p.useCats:
            # For each image, flatten per-category lists into a single list
            ground_truth_instances = [
                [[o for c in i for o in c]] for i in ground_truth_instances
            ]
            detected_instances = [
                [[o for c in i for o in c]] for i in detected_instances
            ]

        # Call C++ implementation of self.evaluateImgs()
        self._evalImgs_cpp = _C.COCOevalEvaluateImages(
            p.areaRng,
            maxDet,
            p.iouThrs,
            ious,
            ground_truth_instances,
            detected_instances,
        )
        self._evalImgs = None

        self._paramsEval = copy.deepcopy(self.params)
        toc = time.time()
        print("COCOeval_opt.evaluate() finished in {:0.2f} seconds.".format(toc - tic))
        # >>>> End of code differences with original COCO API

    def accumulate(self):
        """
        Accumulate per image evaluation results and store the result in self.eval. Does not
        support changing parameter settings from those used by self.evaluate()
        """
        print("Accumulating evaluation results...")
        tic = time.time()
        # NOTE(review): this only warns — the following line still raises
        # AttributeError if evaluate() was never called (matches upstream).
        if not hasattr(self, "_evalImgs_cpp"):
            print("Please run evaluate() first")

        self.eval = _C.COCOevalAccumulate(self._paramsEval, self._evalImgs_cpp)

        # recall is num_iou_thresholds X num_categories X num_area_ranges X num_max_detections
        # (counts[1] is the recall-threshold axis, which recall does not have)
        self.eval["recall"] = np.array(self.eval["recall"]).reshape(
            self.eval["counts"][:1] + self.eval["counts"][2:]
        )

        # precision and scores are num_iou_thresholds X num_recall_thresholds X num_categories X
        # num_area_ranges X num_max_detections
        self.eval["precision"] = np.array(self.eval["precision"]).reshape(
            self.eval["counts"]
        )
        self.eval["scores"] = np.array(self.eval["scores"]).reshape(self.eval["counts"])
        toc = time.time()
        print(
            "COCOeval_opt.accumulate() finished in {:0.2f} seconds.".format(toc - tic)
        )
================================================
FILE: detector/YOLOX/yolox/models/__init__.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
from .darknet import CSPDarknet, Darknet
from .losses import IOUloss
from .yolo_fpn import YOLOFPN
from .yolo_head import YOLOXHead
from .yolo_pafpn import YOLOPAFPN
from .yolox import YOLOX
================================================
FILE: detector/YOLOX/yolox/models/darknet.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
from torch import nn
from .network_blocks import BaseConv, CSPLayer, DWConv, Focus, ResLayer, SPPBottleneck
class Darknet(nn.Module):
    """Classic Darknet backbone (Darknet-21 / Darknet-53) used by YOLOFPN."""

    # number of blocks from dark2 to dark5.
    depth2blocks = {21: [1, 2, 2, 1], 53: [2, 8, 8, 4]}

    def __init__(
        self, depth, in_channels=3, stem_out_channels=32, out_features=("dark3", "dark4", "dark5"),
    ):
        """
        Args:
            depth (int): depth of darknet used in model, usually use [21, 53] for this param.
            in_channels (int): number of input channels, for example, use 3 for RGB image.
            stem_out_channels (int): number of output channels of darknet stem.
                It decides channels of darknet layer2 to layer5.
            out_features (Tuple[str]): desired output layer names.
        """
        super().__init__()
        assert out_features, "please provide output features of Darknet"
        self.out_features = out_features
        # Stem: one 3x3 conv plus the first stride-2 group (downsamples x2).
        self.stem = nn.Sequential(
            BaseConv(in_channels, stem_out_channels, ksize=3, stride=1, act="lrelu"),
            *self.make_group_layer(stem_out_channels, num_blocks=1, stride=2),
        )
        in_channels = stem_out_channels * 2  # 64

        num_blocks = Darknet.depth2blocks[depth]
        # create darknet with `stem_out_channels` and `num_blocks` layers.
        # to make model structure more clear, we don't use `for` statement in python.
        self.dark2 = nn.Sequential(*self.make_group_layer(in_channels, num_blocks[0], stride=2))
        in_channels *= 2  # 128
        self.dark3 = nn.Sequential(*self.make_group_layer(in_channels, num_blocks[1], stride=2))
        in_channels *= 2  # 256
        self.dark4 = nn.Sequential(*self.make_group_layer(in_channels, num_blocks[2], stride=2))
        in_channels *= 2  # 512

        # Final stage adds an SPP block on top of the last stride-2 group.
        self.dark5 = nn.Sequential(
            *self.make_group_layer(in_channels, num_blocks[3], stride=2),
            *self.make_spp_block([in_channels, in_channels * 2], in_channels * 2),
        )

    def make_group_layer(self, in_channels: int, num_blocks: int, stride: int = 1):
        "starts with conv layer then has `num_blocks` `ResLayer`"
        return [
            BaseConv(in_channels, in_channels * 2, ksize=3, stride=stride, act="lrelu"),
            *[(ResLayer(in_channels * 2)) for _ in range(num_blocks)]
        ]

    def make_spp_block(self, filters_list, in_filters):
        # 1x1 / 3x3 conv sandwich around an SPP bottleneck; returns a Sequential.
        m = nn.Sequential(
            *[
                BaseConv(in_filters, filters_list[0], 1, stride=1, act="lrelu"),
                BaseConv(filters_list[0], filters_list[1], 3, stride=1, act="lrelu"),
                SPPBottleneck(
                    in_channels=filters_list[1],
                    out_channels=filters_list[0],
                    activation="lrelu"
                ),
                BaseConv(filters_list[0], filters_list[1], 3, stride=1, act="lrelu"),
                BaseConv(filters_list[1], filters_list[0], 1, stride=1, act="lrelu"),
            ]
        )
        return m

    def forward(self, x):
        # Collect every stage output, then keep only the requested features.
        outputs = {}
        x = self.stem(x)
        outputs["stem"] = x
        x = self.dark2(x)
        outputs["dark2"] = x
        x = self.dark3(x)
        outputs["dark3"] = x
        x = self.dark4(x)
        outputs["dark4"] = x
        x = self.dark5(x)
        outputs["dark5"] = x
        return {k: v for k, v in outputs.items() if k in self.out_features}
class CSPDarknet(nn.Module):
    """CSP-style Darknet backbone (yolov5-flavoured).

    Channel widths scale with `wid_mul`, bottleneck repeat counts with
    `dep_mul`. forward() returns the feature maps named in `out_features`.
    """

    def __init__(
        self, dep_mul, wid_mul,
        out_features=("dark3", "dark4", "dark5"),
        depthwise=False, act="silu",
    ):
        super().__init__()
        assert out_features, "please provide output features of Darknet"
        self.out_features = out_features
        ConvBlock = DWConv if depthwise else BaseConv

        ch = int(wid_mul * 64)              # base channel count (64 at width 1.0)
        depth = max(round(dep_mul * 3), 1)  # base bottleneck repeat count (3 at depth 1.0)

        # stem: space-to-depth focus layer
        self.stem = Focus(3, ch, ksize=3, act=act)

        # dark2: stride-2 conv + CSP stage
        self.dark2 = nn.Sequential(
            ConvBlock(ch, ch * 2, 3, 2, act=act),
            CSPLayer(ch * 2, ch * 2, n=depth, depthwise=depthwise, act=act),
        )

        # dark3: deeper stage (3x the base repeat count)
        self.dark3 = nn.Sequential(
            ConvBlock(ch * 2, ch * 4, 3, 2, act=act),
            CSPLayer(ch * 4, ch * 4, n=depth * 3, depthwise=depthwise, act=act),
        )

        # dark4: deeper stage (3x the base repeat count)
        self.dark4 = nn.Sequential(
            ConvBlock(ch * 4, ch * 8, 3, 2, act=act),
            CSPLayer(ch * 8, ch * 8, n=depth * 3, depthwise=depthwise, act=act),
        )

        # dark5: extra SPP bottleneck; final CSP stage has no shortcuts
        self.dark5 = nn.Sequential(
            ConvBlock(ch * 8, ch * 16, 3, 2, act=act),
            SPPBottleneck(ch * 16, ch * 16, activation=act),
            CSPLayer(
                ch * 16, ch * 16, n=depth,
                shortcut=False, depthwise=depthwise, act=act,
            ),
        )

    def forward(self, x):
        """Return {name: feature} for every name in self.out_features."""
        outputs = {}
        x = self.stem(x)
        outputs["stem"] = x
        for stage_name in ("dark2", "dark3", "dark4", "dark5"):
            x = getattr(self, stage_name)(x)
            outputs[stage_name] = x
        return {k: v for k, v in outputs.items() if k in self.out_features}
================================================
FILE: detector/YOLOX/yolox/models/losses.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import torch
import torch.nn as nn
class IOUloss(nn.Module):
    """IoU-based bounding-box regression loss.

    Boxes are expected in (cx, cy, w, h) format. Supported loss types:
      - "iou":  1 - IoU^2
      - "giou": 1 - GIoU, with GIoU clamped to [-1, 1]
    """

    # Accepted values for loss_type; anything else is rejected at init.
    _SUPPORTED_TYPES = ("iou", "giou")

    def __init__(self, reduction="none", loss_type="iou"):
        """
        Args:
            reduction (str): "none", "mean" or "sum".
            loss_type (str): "iou" or "giou".

        Raises:
            ValueError: if loss_type is not supported. (Previously an unknown
                loss_type only surfaced as an UnboundLocalError inside
                forward(); fail fast at construction instead.)
        """
        super(IOUloss, self).__init__()
        if loss_type not in self._SUPPORTED_TYPES:
            raise ValueError(
                "Unsupported loss_type: {} (expected one of {})".format(
                    loss_type, self._SUPPORTED_TYPES
                )
            )
        self.reduction = reduction
        self.loss_type = loss_type

    def forward(self, pred, target):
        """Compute the per-box loss; pred/target are (..., 4) in cxcywh."""
        assert pred.shape[0] == target.shape[0]

        pred = pred.view(-1, 4)
        target = target.view(-1, 4)
        # intersection corners from center/size form
        tl = torch.max(
            (pred[:, :2] - pred[:, 2:] / 2), (target[:, :2] - target[:, 2:] / 2)
        )
        br = torch.min(
            (pred[:, :2] + pred[:, 2:] / 2), (target[:, :2] + target[:, 2:] / 2)
        )

        area_p = torch.prod(pred[:, 2:], 1)
        area_g = torch.prod(target[:, 2:], 1)

        # en masks out pairs with no overlap (tl >= br on some axis)
        en = (tl < br).type(tl.type()).prod(dim=1)
        area_i = torch.prod(br - tl, 1) * en
        iou = (area_i) / (area_p + area_g - area_i + 1e-16)

        if self.loss_type == "iou":
            loss = 1 - iou ** 2
        else:  # "giou" — the only other value allowed by __init__
            # smallest enclosing box of the pair
            c_tl = torch.min(
                (pred[:, :2] - pred[:, 2:] / 2), (target[:, :2] - target[:, 2:] / 2)
            )
            c_br = torch.max(
                (pred[:, :2] + pred[:, 2:] / 2), (target[:, :2] + target[:, 2:] / 2)
            )
            area_c = torch.prod(c_br - c_tl, 1)
            giou = iou - (area_c - area_i) / area_c.clamp(1e-16)
            loss = 1 - giou.clamp(min=-1.0, max=1.0)

        if self.reduction == "mean":
            loss = loss.mean()
        elif self.reduction == "sum":
            loss = loss.sum()

        return loss
================================================
FILE: detector/YOLOX/yolox/models/network_blocks.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import torch
import torch.nn as nn
class SiLU(nn.Module):
    """Export-friendly drop-in replacement for nn.SiLU().

    Written with primitive ops (mul + sigmoid) so graph exporters that do not
    understand the fused SiLU op can still trace it.
    """

    @staticmethod
    def forward(x):
        return torch.sigmoid(x) * x
def get_activation(name="silu", inplace=True):
    """Look up an activation module by short name.

    Args:
        name (str): one of "silu", "relu", "lrelu".
        inplace (bool): forwarded to the activation's constructor.

    Raises:
        AttributeError: for any unsupported name.
    """
    if name == "silu":
        return nn.SiLU(inplace=inplace)
    if name == "relu":
        return nn.ReLU(inplace=inplace)
    if name == "lrelu":
        return nn.LeakyReLU(0.1, inplace=inplace)
    raise AttributeError("Unsupported act type: {}".format(name))
class BaseConv(nn.Module):
    """Conv2d -> BatchNorm2d -> activation (silu / leaky-relu / relu) block."""

    def __init__(self, in_channels, out_channels, ksize, stride, groups=1, bias=False, act="silu"):
        super().__init__()
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=ksize,
            stride=stride,
            padding=(ksize - 1) // 2,  # "same" spatial size for odd kernels
            groups=groups,
            bias=bias,
        )
        self.bn = nn.BatchNorm2d(out_channels)
        self.act = get_activation(act, inplace=True)

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

    def fuseforward(self, x):
        # used after conv+bn fusion, when bn has been folded into conv weights
        return self.act(self.conv(x))
class DWConv(nn.Module):
    """Depthwise-separable convolution: per-channel (depthwise) conv followed
    by a 1x1 pointwise conv that mixes channels."""

    def __init__(self, in_channels, out_channels, ksize, stride=1, act="silu"):
        super().__init__()
        # depthwise: groups == in_channels, channel count unchanged
        self.dconv = BaseConv(
            in_channels, in_channels, ksize=ksize,
            stride=stride, groups=in_channels, act=act,
        )
        # pointwise: 1x1 conv producing out_channels
        self.pconv = BaseConv(
            in_channels, out_channels, ksize=1,
            stride=1, groups=1, act=act,
        )

    def forward(self, x):
        return self.pconv(self.dconv(x))
class Bottleneck(nn.Module):
    """Standard bottleneck: 1x1 reduce -> 3x3 conv, with an optional
    identity skip when input and output channel counts match."""

    def __init__(
        self, in_channels, out_channels, shortcut=True,
        expansion=0.5, depthwise=False, act="silu"
    ):
        super().__init__()
        hidden = int(out_channels * expansion)
        Conv3x3 = DWConv if depthwise else BaseConv
        self.conv1 = BaseConv(in_channels, hidden, 1, stride=1, act=act)
        self.conv2 = Conv3x3(hidden, out_channels, 3, stride=1, act=act)
        # skip connection only when shapes allow elementwise add
        self.use_add = shortcut and in_channels == out_channels

    def forward(self, x):
        out = self.conv2(self.conv1(x))
        return out + x if self.use_add else out
class ResLayer(nn.Module):
    """Darknet residual layer: 1x1 squeeze to half the channels, 3x3 expand
    back to `in_channels`, plus an identity skip."""

    def __init__(self, in_channels: int):
        super().__init__()
        mid_channels = in_channels // 2
        self.layer1 = BaseConv(in_channels, mid_channels, ksize=1, stride=1, act="lrelu")
        self.layer2 = BaseConv(mid_channels, in_channels, ksize=3, stride=1, act="lrelu")

    def forward(self, x):
        return x + self.layer2(self.layer1(x))
class SPPBottleneck(nn.Module):
    """Spatial pyramid pooling layer used in YOLOv3-SPP: the 1x1-projected
    input is concatenated with several stride-1 max-pooled copies of itself
    before a final 1x1 projection."""

    def __init__(self, in_channels, out_channels, kernel_sizes=(5, 9, 13), activation="silu"):
        super().__init__()
        hidden = in_channels // 2
        self.conv1 = BaseConv(in_channels, hidden, 1, stride=1, act=activation)
        # stride-1 pools with "same" padding, one per kernel size
        self.m = nn.ModuleList(
            nn.MaxPool2d(kernel_size=ks, stride=1, padding=ks // 2)
            for ks in kernel_sizes
        )
        # conv2 sees the original features plus one pooled copy per kernel
        fused_channels = hidden * (len(kernel_sizes) + 1)
        self.conv2 = BaseConv(fused_channels, out_channels, 1, stride=1, act=activation)

    def forward(self, x):
        x = self.conv1(x)
        pooled = [pool(x) for pool in self.m]
        return self.conv2(torch.cat([x, *pooled], dim=1))
class CSPLayer(nn.Module):
    """C3 block from yolov5: CSP bottleneck with three 1x1 convolutions."""

    def __init__(
        self, in_channels, out_channels, n=1,
        shortcut=True, expansion=0.5, depthwise=False, act="silu"
    ):
        """
        Args:
            in_channels (int): input channels.
            out_channels (int): output channels.
            n (int): number of Bottlenecks. Default value: 1.
        """
        super().__init__()
        hidden = int(out_channels * expansion)  # hidden channels
        # two parallel 1x1 projections; only conv1's branch goes through the
        # bottleneck stack, conv2's branch bypasses it (the CSP split)
        self.conv1 = BaseConv(in_channels, hidden, 1, stride=1, act=act)
        self.conv2 = BaseConv(in_channels, hidden, 1, stride=1, act=act)
        # fuse the two branches back to out_channels
        self.conv3 = BaseConv(2 * hidden, out_channels, 1, stride=1, act=act)
        self.m = nn.Sequential(*(
            Bottleneck(hidden, hidden, shortcut, 1.0, depthwise, act=act)
            for _ in range(n)
        ))

    def forward(self, x):
        main_branch = self.m(self.conv1(x))
        bypass_branch = self.conv2(x)
        fused = torch.cat((main_branch, bypass_branch), dim=1)
        return self.conv3(fused)
class Focus(nn.Module):
    """Focus width and height information into channel space.

    Rearranges (b, c, h, w) -> (b, 4c, h/2, w/2) by taking the four 2x2
    pixel phases as separate channel groups, then applies a conv.
    """

    def __init__(self, in_channels, out_channels, ksize=1, stride=1, act="silu"):
        super().__init__()
        self.conv = BaseConv(in_channels * 4, out_channels, ksize, stride, act=act)

    def forward(self, x):
        # every second pixel, offset by (0,0), (1,0), (0,1), (1,1)
        top_left = x[..., ::2, ::2]
        top_right = x[..., ::2, 1::2]
        bot_left = x[..., 1::2, ::2]
        bot_right = x[..., 1::2, 1::2]
        stacked = torch.cat((top_left, bot_left, top_right, bot_right), dim=1)
        return self.conv(stacked)
================================================
FILE: detector/YOLOX/yolox/models/yolo_fpn.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import torch
import torch.nn as nn
from .darknet import Darknet
from .network_blocks import BaseConv
class YOLOFPN(nn.Module):
    """
    YOLOFPN module. Darknet 53 is the default backbone of this model.
    Builds a two-step top-down FPN over the dark3/dark4/dark5 features.
    """

    def __init__(
        self, depth=53, in_features=["dark3", "dark4", "dark5"],
    ):
        super().__init__()

        self.backbone = Darknet(depth)
        self.in_features = in_features

        # branch 1: refine dark5 and merge into dark4
        self.out1_cbl = self._make_cbl(512, 256, 1)
        self.out1 = self._make_embedding([256, 512], 512 + 256)

        # branch 2: refine branch-1 output and merge into dark3
        self.out2_cbl = self._make_cbl(256, 128, 1)
        self.out2 = self._make_embedding([128, 256], 256 + 128)

        # shared nearest-neighbour 2x upsampling
        self.upsample = nn.Upsample(scale_factor=2, mode="nearest")

    def _make_cbl(self, _in, _out, ks):
        """Conv + BN + LeakyReLU block."""
        return BaseConv(_in, _out, ks, stride=1, act="lrelu")

    def _make_embedding(self, filters_list, in_filters):
        """Five alternating 1x1 / 3x3 CBL blocks ending on filters_list[0]."""
        widths = [in_filters, filters_list[0], filters_list[1],
                  filters_list[0], filters_list[1], filters_list[0]]
        ksizes = [1, 3, 1, 3, 1]
        return nn.Sequential(*[
            self._make_cbl(widths[i], widths[i + 1], ksizes[i]) for i in range(5)
        ])

    def load_pretrained_model(self, filename="./weights/darknet53.mix.pth"):
        """Load Darknet-53 backbone weights from a checkpoint file (CPU map)."""
        with open(filename, "rb") as f:
            state_dict = torch.load(f, map_location="cpu")
        print("loading pretrained weights...")
        self.backbone.load_state_dict(state_dict)

    def forward(self, inputs):
        """
        Args:
            inputs (Tensor): input image.
        Returns:
            Tuple[Tensor]: FPN output features (dark3-, dark4-, dark5-level).
        """
        feats = self.backbone(inputs)
        x2, x1, x0 = [feats[f] for f in self.in_features]

        # yolo branch 1: dark5 -> dark4 scale
        branch1 = self.upsample(self.out1_cbl(x0))
        out_dark4 = self.out1(torch.cat([branch1, x1], 1))

        # yolo branch 2: branch-1 output -> dark3 scale
        branch2 = self.upsample(self.out2_cbl(out_dark4))
        out_dark3 = self.out2(torch.cat([branch2, x2], 1))

        return (out_dark3, out_dark4, x0)
================================================
FILE: detector/YOLOX/yolox/models/yolo_head.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import math
from loguru import logger
import torch
import torch.nn as nn
import torch.nn.functional as F
from yolox.utils import bboxes_iou
from .losses import IOUloss
from .network_blocks import BaseConv, DWConv
class YOLOXHead(nn.Module):
    """Decoupled YOLOX detection head with SimOTA label assignment.

    One stem plus separate classification and regression branches is built per
    FPN level; the regression branch also predicts objectness. In training
    mode forward() returns the loss tuple from get_losses(); in eval mode it
    returns per-anchor predictions, decoded to image coordinates when
    decode_in_inference is True.
    """

    def __init__(
        self, num_classes, width=1.0, strides=[8, 16, 32],
        in_channels=[256, 512, 1024], act="silu", depthwise=False
    ):
        """
        Args:
            num_classes (int): number of object classes.
            width (float): channel-width multiplier applied to the head convs.
            strides (List[int]): downsampling stride of each FPN level.
            in_channels (List[int]): input channels of each FPN level.
            act (str): activation type of conv. Default value: "silu".
            depthwise (bool): whether to apply depthwise conv in the conv
                branches. Default value: False.
        """
        super().__init__()

        self.n_anchors = 1  # anchor-free: a single prediction per location
        self.num_classes = num_classes
        self.decode_in_inference = True  # for deploy, set to False

        self.cls_convs = nn.ModuleList()
        self.reg_convs = nn.ModuleList()
        self.cls_preds = nn.ModuleList()
        self.reg_preds = nn.ModuleList()
        self.obj_preds = nn.ModuleList()
        self.stems = nn.ModuleList()
        Conv = DWConv if depthwise else BaseConv

        # one stem + branch pair per FPN level
        for i in range(len(in_channels)):
            # 1x1 stem projecting the FPN feature to the shared head width
            self.stems.append(
                BaseConv(
                    in_channels=int(in_channels[i] * width),
                    out_channels=int(256 * width),
                    ksize=1,
                    stride=1,
                    act=act,
                )
            )
            # two 3x3 convs feeding the classification predictor
            self.cls_convs.append(
                nn.Sequential(
                    *[
                        Conv(
                            in_channels=int(256 * width),
                            out_channels=int(256 * width),
                            ksize=3,
                            stride=1,
                            act=act,
                        ),
                        Conv(
                            in_channels=int(256 * width),
                            out_channels=int(256 * width),
                            ksize=3,
                            stride=1,
                            act=act,
                        ),
                    ]
                )
            )
            # two 3x3 convs feeding the box-regression and objectness predictors
            self.reg_convs.append(
                nn.Sequential(
                    *[
                        Conv(
                            in_channels=int(256 * width),
                            out_channels=int(256 * width),
                            ksize=3,
                            stride=1,
                            act=act,
                        ),
                        Conv(
                            in_channels=int(256 * width),
                            out_channels=int(256 * width),
                            ksize=3,
                            stride=1,
                            act=act,
                        ),
                    ]
                )
            )
            # per-class logits
            self.cls_preds.append(
                nn.Conv2d(
                    in_channels=int(256 * width),
                    out_channels=self.n_anchors * self.num_classes,
                    kernel_size=1,
                    stride=1,
                    padding=0,
                )
            )
            # box regression: (dx, dy, log w, log h)
            self.reg_preds.append(
                nn.Conv2d(
                    in_channels=int(256 * width),
                    out_channels=4,
                    kernel_size=1,
                    stride=1,
                    padding=0,
                )
            )
            # objectness logit
            self.obj_preds.append(
                nn.Conv2d(
                    in_channels=int(256 * width),
                    out_channels=self.n_anchors * 1,
                    kernel_size=1,
                    stride=1,
                    padding=0,
                )
            )

        # extra L1 loss on raw box outputs (enabled late in training elsewhere)
        self.use_l1 = False
        self.l1_loss = nn.L1Loss(reduction="none")
        self.bcewithlog_loss = nn.BCEWithLogitsLoss(reduction="none")
        self.iou_loss = IOUloss(reduction="none")
        self.strides = strides
        # per-level grids, lazily (re)built in get_output_and_grid
        self.grids = [torch.zeros(1)] * len(in_channels)
        self.expanded_strides = [None] * len(in_channels)

    def initialize_biases(self, prior_prob):
        """Initialize cls/obj prediction biases so that the initial sigmoid
        output equals prior_prob (focal-loss style initialization)."""
        for conv in self.cls_preds:
            b = conv.bias.view(self.n_anchors, -1)
            b.data.fill_(-math.log((1 - prior_prob) / prior_prob))
            conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)

        for conv in self.obj_preds:
            b = conv.bias.view(self.n_anchors, -1)
            b.data.fill_(-math.log((1 - prior_prob) / prior_prob))
            conv.bias = torch.nn.Parameter(b.view(-1), requires_grad=True)

    def forward(self, xin, labels=None, imgs=None):
        """Run the head on the FPN features.

        Args:
            xin (Sequence[Tensor]): one feature map per FPN level.
            labels: ground-truth targets, required in training mode.
            imgs: input images, forwarded to get_losses in training mode.

        Returns:
            Training: the loss tuple from get_losses().
            Eval: [batch, n_anchors_all, 5 + num_classes] predictions.
        """
        outputs = []
        origin_preds = []
        x_shifts = []
        y_shifts = []
        expanded_strides = []

        for k, (cls_conv, reg_conv, stride_this_level, x) in enumerate(
            zip(self.cls_convs, self.reg_convs, self.strides, xin)
        ):
            x = self.stems[k](x)
            cls_x = x
            reg_x = x

            cls_feat = cls_conv(cls_x)
            cls_output = self.cls_preds[k](cls_feat)

            reg_feat = reg_conv(reg_x)
            reg_output = self.reg_preds[k](reg_feat)
            obj_output = self.obj_preds[k](reg_feat)

            if self.training:
                # raw logits; decoding to image coords happens in
                # get_output_and_grid, which also yields the level's grid
                output = torch.cat([reg_output, obj_output, cls_output], 1)
                output, grid = self.get_output_and_grid(output, k, stride_this_level, xin[0].type())
                x_shifts.append(grid[:, :, 0])
                y_shifts.append(grid[:, :, 1])
                expanded_strides.append(
                    torch.zeros(1, grid.shape[1]).fill_(stride_this_level).type_as(xin[0])
                )
                if self.use_l1:
                    # keep the undecoded reg output for the L1 loss
                    batch_size = reg_output.shape[0]
                    hsize, wsize = reg_output.shape[-2:]
                    reg_output = reg_output.view(batch_size, self.n_anchors, 4, hsize, wsize)
                    reg_output = (
                        reg_output.permute(0, 1, 3, 4, 2)
                        .reshape(batch_size, -1, 4)
                    )
                    origin_preds.append(reg_output.clone())
            else:
                # eval: obj/cls probabilities, reg left raw for decode_outputs
                output = torch.cat([reg_output, obj_output.sigmoid(), cls_output.sigmoid()], 1)

            outputs.append(output)

        if self.training:
            return self.get_losses(
                imgs, x_shifts, y_shifts, expanded_strides, labels,
                torch.cat(outputs, 1), origin_preds, dtype=xin[0].dtype
            )
        else:
            # cache per-level spatial sizes for decode_outputs
            self.hw = [x.shape[-2:] for x in outputs]
            # [batch, n_anchors_all, 85]
            outputs = torch.cat([x.flatten(start_dim=2) for x in outputs], dim=2).permute(0, 2, 1)
            if self.decode_in_inference:
                return self.decode_outputs(outputs, dtype=xin[0].type())
            else:
                return outputs

    def get_output_and_grid(self, output, k, stride, dtype):
        """Decode one level's raw output to image coordinates.

        Returns the flattened [batch, n_anchors*h*w, 5+num_classes] output and
        the matching [1, h*w, 2] grid of cell coordinates.
        """
        grid = self.grids[k]

        batch_size = output.shape[0]
        n_ch = 5 + self.num_classes
        hsize, wsize = output.shape[-2:]
        # rebuild the cached grid only when the feature size changed
        if grid.shape[2:4] != output.shape[2:4]:
            yv, xv = torch.meshgrid([torch.arange(hsize), torch.arange(wsize)])
            grid = torch.stack((xv, yv), 2).view(1, 1, hsize, wsize, 2).type(dtype)
            self.grids[k] = grid

        output = output.view(batch_size, self.n_anchors, n_ch, hsize, wsize)
        output = (
            output.permute(0, 1, 3, 4, 2)
            .reshape(batch_size, self.n_anchors * hsize * wsize, -1)
        )
        grid = grid.view(1, -1, 2)
        # center offsets are relative to the grid cell; sizes are log-encoded
        output[..., :2] = (output[..., :2] + grid) * stride
        output[..., 2:4] = torch.exp(output[..., 2:4]) * stride
        return output, grid

    def decode_outputs(self, outputs, dtype):
        """Decode concatenated eval-mode outputs to image coordinates using
        the per-level sizes cached in self.hw."""
        grids = []
        strides = []
        for (hsize, wsize), stride in zip(self.hw, self.strides):
            yv, xv = torch.meshgrid([torch.arange(hsize), torch.arange(wsize)])
            grid = torch.stack((xv, yv), 2).view(1, -1, 2)
            grids.append(grid)
            shape = grid.shape[:2]
            strides.append(torch.full((*shape, 1), stride))

        grids = torch.cat(grids, dim=1).type(dtype)
        strides = torch.cat(strides, dim=1).type(dtype)

        outputs[..., :2] = (outputs[..., :2] + grids) * strides
        outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides
        return outputs

    def get_losses(
        self, imgs, x_shifts, y_shifts, expanded_strides, labels, outputs, origin_preds, dtype,
    ):
        """Compute IoU, objectness, classification (and optional L1) losses
        using SimOTA assignment per image.

        Returns:
            (total, weighted iou, obj, cls, l1, fg-per-gt ratio).
        """
        bbox_preds = outputs[:, :, :4]  # [batch, n_anchors_all, 4]
        obj_preds = outputs[:, :, 4].unsqueeze(-1)  # [batch, n_anchors_all, 1]
        cls_preds = outputs[:, :, 5:]  # [batch, n_anchors_all, n_cls]

        # calculate targets
        # labels with more than 5 fields per object carry mixup extras;
        # only (class, cx, cy, w, h) are used here
        mixup = labels.shape[2] > 5
        if mixup:
            label_cut = labels[..., :5]
        else:
            label_cut = labels
        nlabel = (label_cut.sum(dim=2) > 0).sum(dim=1)  # number of objects

        total_num_anchors = outputs.shape[1]
        x_shifts = torch.cat(x_shifts, 1)  # [1, n_anchors_all]
        y_shifts = torch.cat(y_shifts, 1)  # [1, n_anchors_all]
        expanded_strides = torch.cat(expanded_strides, 1)
        if self.use_l1:
            origin_preds = torch.cat(origin_preds, 1)

        cls_targets = []
        reg_targets = []
        l1_targets = []
        obj_targets = []
        fg_masks = []

        num_fg = 0.0
        num_gts = 0.0

        for batch_idx in range(outputs.shape[0]):
            num_gt = int(nlabel[batch_idx])
            num_gts += num_gt
            if num_gt == 0:
                # no objects: all anchors are background
                cls_target = outputs.new_zeros((0, self.num_classes))
                reg_target = outputs.new_zeros((0, 4))
                l1_target = outputs.new_zeros((0, 4))
                obj_target = outputs.new_zeros((total_num_anchors, 1))
                fg_mask = outputs.new_zeros(total_num_anchors).bool()
            else:
                gt_bboxes_per_image = labels[batch_idx, :num_gt, 1:5]
                gt_classes = labels[batch_idx, :num_gt, 0]
                bboxes_preds_per_image = bbox_preds[batch_idx]

                try:
                    gt_matched_classes, fg_mask, pred_ious_this_matching, matched_gt_inds, num_fg_img = self.get_assignments(  # noqa
                        batch_idx, num_gt, total_num_anchors, gt_bboxes_per_image, gt_classes,
                        bboxes_preds_per_image, expanded_strides, x_shifts, y_shifts,
                        cls_preds, bbox_preds, obj_preds, labels, imgs,
                    )
                except RuntimeError:
                    # retry the assignment on CPU after a GPU OOM
                    logger.error(
                        "OOM RuntimeError is raised due to the huge memory cost during label assignment. \
                           CPU mode is applied in this batch. If you want to avoid this issue, \
                           try to reduce the batch size or image size."
                    )
                    torch.cuda.empty_cache()
                    gt_matched_classes, fg_mask, pred_ious_this_matching, matched_gt_inds, num_fg_img = self.get_assignments(  # noqa
                        batch_idx, num_gt, total_num_anchors, gt_bboxes_per_image, gt_classes,
                        bboxes_preds_per_image, expanded_strides, x_shifts, y_shifts,
                        cls_preds, bbox_preds, obj_preds, labels, imgs, "cpu",
                    )

                torch.cuda.empty_cache()
                num_fg += num_fg_img

                # soft cls target: one-hot scaled by the matched pair's IoU
                cls_target = F.one_hot(
                    gt_matched_classes.to(torch.int64), self.num_classes
                ) * pred_ious_this_matching.unsqueeze(-1)
                obj_target = fg_mask.unsqueeze(-1)
                reg_target = gt_bboxes_per_image[matched_gt_inds]
                if self.use_l1:
                    l1_target = self.get_l1_target(
                        outputs.new_zeros((num_fg_img, 4)),
                        gt_bboxes_per_image[matched_gt_inds],
                        expanded_strides[0][fg_mask],
                        x_shifts=x_shifts[0][fg_mask],
                        y_shifts=y_shifts[0][fg_mask],
                    )

            cls_targets.append(cls_target)
            reg_targets.append(reg_target)
            obj_targets.append(obj_target.to(dtype))
            fg_masks.append(fg_mask)
            if self.use_l1:
                l1_targets.append(l1_target)

        cls_targets = torch.cat(cls_targets, 0)
        reg_targets = torch.cat(reg_targets, 0)
        obj_targets = torch.cat(obj_targets, 0)
        fg_masks = torch.cat(fg_masks, 0)
        if self.use_l1:
            l1_targets = torch.cat(l1_targets, 0)

        # avoid division by zero when no anchor was assigned foreground
        num_fg = max(num_fg, 1)
        loss_iou = (self.iou_loss(bbox_preds.view(-1, 4)[fg_masks], reg_targets)).sum() / num_fg
        loss_obj = (self.bcewithlog_loss(obj_preds.view(-1, 1), obj_targets)).sum() / num_fg
        loss_cls = (
            self.bcewithlog_loss(cls_preds.view(-1, self.num_classes)[fg_masks], cls_targets)
        ).sum() / num_fg
        if self.use_l1:
            loss_l1 = (self.l1_loss(origin_preds.view(-1, 4)[fg_masks], l1_targets)).sum() / num_fg
        else:
            loss_l1 = 0.0

        reg_weight = 5.0
        loss = reg_weight * loss_iou + loss_obj + loss_cls + loss_l1

        return loss, reg_weight * loss_iou, loss_obj, loss_cls, loss_l1, num_fg / max(num_gts, 1)

    def get_l1_target(self, l1_target, gt, stride, x_shifts, y_shifts, eps=1e-8):
        """Fill l1_target with the raw (pre-decode) regression targets for the
        matched ground-truth boxes: cell-relative centers and log-sizes."""
        l1_target[:, 0] = gt[:, 0] / stride - x_shifts
        l1_target[:, 1] = gt[:, 1] / stride - y_shifts
        l1_target[:, 2] = torch.log(gt[:, 2] / stride + eps)
        l1_target[:, 3] = torch.log(gt[:, 3] / stride + eps)
        return l1_target

    @torch.no_grad()
    def get_assignments(
        self, batch_idx, num_gt, total_num_anchors, gt_bboxes_per_image, gt_classes,
        bboxes_preds_per_image, expanded_strides, x_shifts, y_shifts,
        cls_preds, bbox_preds, obj_preds, labels, imgs, mode="gpu",
    ):
        """SimOTA label assignment for one image.

        Builds a cost matrix (cls BCE + 3 * IoU loss + large penalty for
        anchors outside both the gt box and its center region) and matches
        via dynamic_k_matching. mode="cpu" moves the computation off-GPU
        (used as the OOM fallback in get_losses).
        """

        if mode == "cpu":
            print("------------CPU Mode for This Batch-------------")
            gt_bboxes_per_image = gt_bboxes_per_image.cpu().float()
            bboxes_preds_per_image = bboxes_preds_per_image.cpu().float()
            gt_classes = gt_classes.cpu().float()
            expanded_strides = expanded_strides.cpu().float()
            x_shifts = x_shifts.cpu()
            y_shifts = y_shifts.cpu()

        # candidate anchors: inside a gt box or inside a gt center region
        fg_mask, is_in_boxes_and_center = self.get_in_boxes_info(
            gt_bboxes_per_image, expanded_strides, x_shifts, y_shifts, total_num_anchors, num_gt,
        )

        bboxes_preds_per_image = bboxes_preds_per_image[fg_mask]
        cls_preds_ = cls_preds[batch_idx][fg_mask]
        obj_preds_ = obj_preds[batch_idx][fg_mask]
        num_in_boxes_anchor = bboxes_preds_per_image.shape[0]

        if mode == "cpu":
            gt_bboxes_per_image = gt_bboxes_per_image.cpu()
            bboxes_preds_per_image = bboxes_preds_per_image.cpu()

        pair_wise_ious = bboxes_iou(
            gt_bboxes_per_image, bboxes_preds_per_image, False
        )

        gt_cls_per_image = (
            F.one_hot(gt_classes.to(torch.int64), self.num_classes).float()
            .unsqueeze(1).repeat(1, num_in_boxes_anchor, 1)
        )
        pair_wise_ious_loss = -torch.log(pair_wise_ious + 1e-8)

        if mode == "cpu":
            cls_preds_, obj_preds_ = cls_preds_.cpu(), obj_preds_.cpu()

        # joint cls score = sqrt(cls_prob * obj_prob), compared against one-hot
        cls_preds_ = (
            cls_preds_.float().unsqueeze(0).repeat(num_gt, 1, 1).sigmoid_()
            * obj_preds_.unsqueeze(0).repeat(num_gt, 1, 1).sigmoid_()
        )
        pair_wise_cls_loss = F.binary_cross_entropy(
            cls_preds_.sqrt_(), gt_cls_per_image, reduction="none"
        ).sum(-1)
        del cls_preds_

        cost = (
            pair_wise_cls_loss
            + 3.0 * pair_wise_ious_loss
            + 100000.0 * (~is_in_boxes_and_center)  # effectively forbids these pairs
        )

        (
            num_fg, gt_matched_classes, pred_ious_this_matching, matched_gt_inds
        ) = self.dynamic_k_matching(cost, pair_wise_ious, gt_classes, num_gt, fg_mask)
        del pair_wise_cls_loss, cost, pair_wise_ious, pair_wise_ious_loss

        if mode == "cpu":
            gt_matched_classes = gt_matched_classes.cuda()
            fg_mask = fg_mask.cuda()
            pred_ious_this_matching = pred_ious_this_matching.cuda()
            matched_gt_inds = matched_gt_inds.cuda()

        return gt_matched_classes, fg_mask, pred_ious_this_matching, matched_gt_inds, num_fg

    def get_in_boxes_info(
        self, gt_bboxes_per_image, expanded_strides, x_shifts, y_shifts, total_num_anchors, num_gt,
    ):
        """Determine which anchor centers fall inside gt boxes / gt center
        regions.

        Returns:
            is_in_boxes_anchor: [n_anchor] bool — inside ANY gt box OR center
                region (the candidate foreground set).
            is_in_boxes_and_center: [n_gt, n_candidate] bool — inside BOTH the
                gt box AND its center region, restricted to candidates.
        """
        expanded_strides_per_image = expanded_strides[0]
        # anchor-center coordinates in image space
        x_shifts_per_image = x_shifts[0] * expanded_strides_per_image
        y_shifts_per_image = y_shifts[0] * expanded_strides_per_image
        x_centers_per_image = (
            (x_shifts_per_image + 0.5 * expanded_strides_per_image)
            .unsqueeze(0)
            .repeat(num_gt, 1)
        )  # [n_anchor] -> [n_gt, n_anchor]
        y_centers_per_image = (
            (y_shifts_per_image + 0.5 * expanded_strides_per_image)
            .unsqueeze(0)
            .repeat(num_gt, 1)
        )

        # gt box edges from cxcywh form
        gt_bboxes_per_image_l = (
            (gt_bboxes_per_image[:, 0] - 0.5 * gt_bboxes_per_image[:, 2])
            .unsqueeze(1)
            .repeat(1, total_num_anchors)
        )
        gt_bboxes_per_image_r = (
            (gt_bboxes_per_image[:, 0] + 0.5 * gt_bboxes_per_image[:, 2])
            .unsqueeze(1)
            .repeat(1, total_num_anchors)
        )
        gt_bboxes_per_image_t = (
            (gt_bboxes_per_image[:, 1] - 0.5 * gt_bboxes_per_image[:, 3])
            .unsqueeze(1)
            .repeat(1, total_num_anchors)
        )
        gt_bboxes_per_image_b = (
            (gt_bboxes_per_image[:, 1] + 0.5 * gt_bboxes_per_image[:, 3])
            .unsqueeze(1)
            .repeat(1, total_num_anchors)
        )

        b_l = x_centers_per_image - gt_bboxes_per_image_l
        b_r = gt_bboxes_per_image_r - x_centers_per_image
        b_t = y_centers_per_image - gt_bboxes_per_image_t
        b_b = gt_bboxes_per_image_b - y_centers_per_image
        bbox_deltas = torch.stack([b_l, b_t, b_r, b_b], 2)

        # center is inside a box iff all four signed distances are positive
        is_in_boxes = bbox_deltas.min(dim=-1).values > 0.0
        is_in_boxes_all = is_in_boxes.sum(dim=0) > 0
        # in fixed center
        # square region of +/- 2.5 strides around each gt center
        center_radius = 2.5

        gt_bboxes_per_image_l = (gt_bboxes_per_image[:, 0]).unsqueeze(1).repeat(
            1, total_num_anchors
        ) - center_radius * expanded_strides_per_image.unsqueeze(0)
        gt_bboxes_per_image_r = (gt_bboxes_per_image[:, 0]).unsqueeze(1).repeat(
            1, total_num_anchors
        ) + center_radius * expanded_strides_per_image.unsqueeze(0)
        gt_bboxes_per_image_t = (gt_bboxes_per_image[:, 1]).unsqueeze(1).repeat(
            1, total_num_anchors
        ) - center_radius * expanded_strides_per_image.unsqueeze(0)
        gt_bboxes_per_image_b = (gt_bboxes_per_image[:, 1]).unsqueeze(1).repeat(
            1, total_num_anchors
        ) + center_radius * expanded_strides_per_image.unsqueeze(0)

        c_l = x_centers_per_image - gt_bboxes_per_image_l
        c_r = gt_bboxes_per_image_r - x_centers_per_image
        c_t = y_centers_per_image - gt_bboxes_per_image_t
        c_b = gt_bboxes_per_image_b - y_centers_per_image
        center_deltas = torch.stack([c_l, c_t, c_r, c_b], 2)
        is_in_centers = center_deltas.min(dim=-1).values > 0.0
        is_in_centers_all = is_in_centers.sum(dim=0) > 0

        # in boxes and in centers
        is_in_boxes_anchor = is_in_boxes_all | is_in_centers_all

        is_in_boxes_and_center = (
            is_in_boxes[:, is_in_boxes_anchor] & is_in_centers[:, is_in_boxes_anchor]
        )
        return is_in_boxes_anchor, is_in_boxes_and_center

    def dynamic_k_matching(self, cost, pair_wise_ious, gt_classes, num_gt, fg_mask):
        """SimOTA dynamic-k matching.

        Each gt gets k = clamp(sum of its top-10 IoUs, min 1) anchors with the
        lowest cost; anchors claimed by several gts keep only the cheapest gt.
        fg_mask is updated in place to the final foreground set.

        Returns:
            (num_fg, matched gt classes, matched pair IoUs, matched gt indices).
        """
        # Dynamic K
        # ---------------------------------------------------------------
        matching_matrix = torch.zeros_like(cost)

        ious_in_boxes_matrix = pair_wise_ious
        n_candidate_k = 10
        topk_ious, _ = torch.topk(ious_in_boxes_matrix, n_candidate_k, dim=1)
        # k per gt: the (truncated) sum of its best candidate IoUs, at least 1
        dynamic_ks = torch.clamp(topk_ious.sum(1).int(), min=1)
        for gt_idx in range(num_gt):
            _, pos_idx = torch.topk(
                cost[gt_idx], k=dynamic_ks[gt_idx].item(), largest=False
            )
            matching_matrix[gt_idx][pos_idx] = 1.0

        del topk_ious, dynamic_ks, pos_idx

        # resolve anchors matched to more than one gt: keep the cheapest
        anchor_matching_gt = matching_matrix.sum(0)
        if (anchor_matching_gt > 1).sum() > 0:
            cost_min, cost_argmin = torch.min(cost[:, anchor_matching_gt > 1], dim=0)
            matching_matrix[:, anchor_matching_gt > 1] *= 0.0
            matching_matrix[cost_argmin, anchor_matching_gt > 1] = 1.0
        fg_mask_inboxes = matching_matrix.sum(0) > 0.0
        num_fg = fg_mask_inboxes.sum().item()

        # narrow the candidate fg_mask to the finally matched anchors
        fg_mask[fg_mask.clone()] = fg_mask_inboxes

        matched_gt_inds = matching_matrix[:, fg_mask_inboxes].argmax(0)
        gt_matched_classes = gt_classes[matched_gt_inds]

        pred_ious_this_matching = (matching_matrix * pair_wise_ious).sum(0)[fg_mask_inboxes]
        return num_fg, gt_matched_classes, pred_ious_this_matching, matched_gt_inds
================================================
FILE: detector/YOLOX/yolox/models/yolo_pafpn.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import torch
import torch.nn as nn
from .darknet import CSPDarknet
from .network_blocks import BaseConv, CSPLayer, DWConv
class YOLOPAFPN(nn.Module):
    """
    YOLOv3 model. Darknet 53 is the default backbone of this model.
    PAN-style neck on top of CSPDarknet: a top-down FPN pass followed by a
    bottom-up path aggregation pass over dark3/dark4/dark5.
    """

    def __init__(
        self, depth=1.0, width=1.0, in_features=("dark3", "dark4", "dark5"),
        in_channels=[256, 512, 1024], depthwise=False, act="silu",
    ):
        super().__init__()
        self.backbone = CSPDarknet(depth, width, depthwise=depthwise, act=act)
        self.in_features = in_features
        self.in_channels = in_channels
        ConvBlock = DWConv if depthwise else BaseConv

        self.upsample = nn.Upsample(scale_factor=2, mode="nearest")

        # --- top-down (FPN) path ---
        self.lateral_conv0 = BaseConv(
            int(in_channels[2] * width), int(in_channels[1] * width), 1, 1, act=act
        )
        self.C3_p4 = CSPLayer(
            int(2 * in_channels[1] * width), int(in_channels[1] * width),
            round(3 * depth), False, depthwise=depthwise, act=act,
        )  # cat
        self.reduce_conv1 = BaseConv(
            int(in_channels[1] * width), int(in_channels[0] * width), 1, 1, act=act
        )
        self.C3_p3 = CSPLayer(
            int(2 * in_channels[0] * width), int(in_channels[0] * width),
            round(3 * depth), False, depthwise=depthwise, act=act,
        )

        # --- bottom-up (PAN) path ---
        self.bu_conv2 = ConvBlock(
            int(in_channels[0] * width), int(in_channels[0] * width), 3, 2, act=act
        )
        self.C3_n3 = CSPLayer(
            int(2 * in_channels[0] * width), int(in_channels[1] * width),
            round(3 * depth), False, depthwise=depthwise, act=act,
        )
        self.bu_conv1 = ConvBlock(
            int(in_channels[1] * width), int(in_channels[1] * width), 3, 2, act=act
        )
        self.C3_n4 = CSPLayer(
            int(2 * in_channels[1] * width), int(in_channels[2] * width),
            round(3 * depth), False, depthwise=depthwise, act=act,
        )

    def forward(self, input):
        """
        Args:
            input: input images.
        Returns:
            Tuple[Tensor]: FPN features at strides 8, 16 and 32.
        """
        backbone_feats = self.backbone(input)
        feat_s, feat_m, feat_l = [backbone_feats[f] for f in self.in_features]

        # top-down path: refine coarse features and merge into finer levels
        lat0 = self.lateral_conv0(feat_l)                                   # 1024->512 /32
        td0 = self.C3_p4(torch.cat([self.upsample(lat0), feat_m], 1))       # ->512 /16
        lat1 = self.reduce_conv1(td0)                                       # 512->256 /16
        pan_out2 = self.C3_p3(torch.cat([self.upsample(lat1), feat_s], 1))  # ->256 /8

        # bottom-up path: re-aggregate fine features back into coarse levels
        pan_out1 = self.C3_n3(torch.cat([self.bu_conv2(pan_out2), lat1], 1))  # ->512 /16
        pan_out0 = self.C3_n4(torch.cat([self.bu_conv1(pan_out1), lat0], 1))  # ->1024 /32

        return (pan_out2, pan_out1, pan_out0)
================================================
FILE: detector/YOLOX/yolox/models/yolox.py
================================================
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import torch.nn as nn
from .yolo_head import YOLOXHead
from .yolo_pafpn import YOLOPAFPN
class YOLOX(nn.Module):
    """
    YOLOX model module. The module list is defined by create_yolov3_modules function.
    The network returns loss values from three YOLO layers during training
    and detection results during test.
    """

    def __init__(self, backbone=None, head=None):
        super().__init__()
        # default to the standard PAFPN neck and an 80-class head
        self.backbone = YOLOPAFPN() if backbone is None else backbone
        self.head = YOLOXHead(80) if head is None else head

    def forward(self, x, targets=None):
        # fpn output content features of [dark3, dark4, dark5]
        fpn_outs = self.backbone(x)

        if not self.training:
            return self.head(fpn_outs)

        assert targets is not None
        loss, iou_loss, conf_loss, cls_loss, l1_loss, num_fg = self.head(
            fpn_outs, targets, x
        )
        return {
            "total_loss": loss,
            "iou_loss": iou_loss,
            "l1_loss": l1_loss,
            "conf_loss": conf_loss,
            "cls_loss": cls_loss,
            "num_fg": num_fg,
        }
================================================
FILE: detector/YOLOX/yolox/utils/__init__.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
from .allreduce_norm import *
from .boxes import *
from .checkpoint import load_ckpt, save_checkpoint
from .demo_utils import *
from .dist import *
from .ema import ModelEMA
from .logger import setup_logger
from .lr_scheduler import LRScheduler
from .metric import *
from .model_utils import *
from .setup_env import *
from .visualize import *
================================================
FILE: detector/YOLOX/yolox/utils/allreduce_norm.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import pickle
from collections import OrderedDict
import torch
from torch import distributed as dist
from torch import nn
from .dist import _get_global_gloo_group, get_world_size
# Normalization layers whose buffers/weights are gathered by
# get_async_norm_states() and synchronized by all_reduce_norm() below.
ASYNC_NORM = (
    nn.BatchNorm1d,
    nn.BatchNorm2d,
    nn.BatchNorm3d,
    nn.InstanceNorm1d,
    nn.InstanceNorm2d,
    nn.InstanceNorm3d,
)

# Public API of this module.
__all__ = [
    "get_async_norm_states", "pyobj2tensor", "tensor2pyobj", "all_reduce", "all_reduce_norm"
]
def get_async_norm_states(module):
    """Collect the state dict entries of every ASYNC_NORM layer inside
    `module`, keyed by fully qualified submodule.parameter name."""
    collected = OrderedDict()
    for prefix, layer in module.named_modules():
        if not isinstance(layer, ASYNC_NORM):
            continue
        for key, value in layer.state_dict().items():
            collected["{}.{}".format(prefix, key)] = value
    return collected
def pyobj2tensor(pyobj, device="cuda"):
    """Serialize a picklable Python object into a uint8 tensor on `device`."""
    raw = pickle.dumps(pyobj)
    return torch.ByteTensor(torch.ByteStorage.from_buffer(raw)).to(device=device)
def tensor2pyobj(tensor):
    """Inverse of pyobj2tensor: unpickle a Python object from a byte tensor."""
    raw = tensor.cpu().numpy().tobytes()
    return pickle.loads(raw)
def _get_reduce_op(op_name):
return {
"sum": dist.ReduceOp.SUM,
"mean": dist.ReduceOp.SUM,
}[op_name.lower()]
def all_reduce(py_dict, op="sum", group=None):
    """
    Apply all reduce function for python dict object.
    NOTE: make sure that every py_dict has the same keys and values are in the same shape.
    Args:
        py_dict (dict): dict to apply all reduce op.
        op (str): operator, could be "sum" or "mean".
        group: process group to reduce over; defaults to the cached global
            gloo group (see _get_global_gloo_group).
    Returns:
        the input dict unchanged when world size is 1, otherwise an
        OrderedDict of reduced tensors keyed by rank 0's key order.
    """
    world_size = get_world_size()
    if world_size == 1:
        return py_dict
    if group is None:
        group = _get_global_gloo_group()
    if dist.get_world_size(group) == 1:
        return py_dict
    # all reduce logic across different devices.
    # Broadcast rank 0's key list so every rank flattens in the same order.
    py_key = list(py_dict.keys())
    py_key_tensor = pyobj2tensor(py_key)
    dist.broadcast(py_key_tensor, src=0)
    py_key = tensor2pyobj(py_key_tensor)
    tensor_shapes = [py_dict[k].shape for k in py_key]
    tensor_numels = [py_dict[k].numel() for k in py_key]
    # One fused all_reduce over a single flattened buffer instead of per-tensor calls.
    flatten_tensor = torch.cat([py_dict[k].flatten() for k in py_key])
    dist.all_reduce(flatten_tensor, op=_get_reduce_op(op))
    if op == "mean":
        # _get_reduce_op maps "mean" to SUM; dividing here completes the average.
        flatten_tensor /= world_size
    split_tensors = [
        x.reshape(shape) for x, shape in zip(
            torch.split(flatten_tensor, tensor_numels), tensor_shapes
        )
    ]
    return OrderedDict({k: v for k, v in zip(py_key, split_tensors)})
def all_reduce_norm(module):
    """Average the async-norm statistics of ``module`` across all ranks."""
    norm_states = get_async_norm_states(module)
    reduced = all_reduce(norm_states, op="mean")
    module.load_state_dict(reduced, strict=False)
================================================
FILE: detector/YOLOX/yolox/utils/boxes.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import numpy as np
import torch
import torchvision
__all__ = [
"filter_box", "postprocess", "bboxes_iou", "matrix_iou",
"adjust_box_anns", "xyxy2xywh",
]
def filter_box(output, scale_range):
"""
output: (N, 5+class) shape
"""
min_scale, max_scale = scale_range
w = output[:, 2] - output[:, 0]
h = output[:, 3] - output[:, 1]
keep = (w * h > min_scale * min_scale) & (w * h < max_scale * max_scale)
return output[keep]
def postprocess(prediction, num_classes, conf_thre=0.7, nms_thre=0.45):
    """Turn raw YOLOX predictions into per-image NMS-filtered detections.

    Args:
        prediction: (batch, num_anchors, 5 + num_classes) tensor; channels are
            (cx, cy, w, h, objectness, per-class scores). NOTE: mutated in
            place — the first four channels are rewritten as corner coords.
        num_classes (int): number of class scores after the first 5 channels.
        conf_thre (float): threshold on objectness * best-class score.
        nms_thre (float): IoU threshold for class-aware NMS.
    Returns:
        list of length batch; each entry is an (M, 7) tensor of
        (x1, y1, x2, y2, obj_conf, class_conf, class_pred) or None.
    """
    # (cx, cy, w, h) -> (x1, y1, x2, y2)
    box_corner = prediction.new(prediction.shape)
    box_corner[:, :, 0] = prediction[:, :, 0] - prediction[:, :, 2] / 2
    box_corner[:, :, 1] = prediction[:, :, 1] - prediction[:, :, 3] / 2
    box_corner[:, :, 2] = prediction[:, :, 0] + prediction[:, :, 2] / 2
    box_corner[:, :, 3] = prediction[:, :, 1] + prediction[:, :, 3] / 2
    prediction[:, :, :4] = box_corner[:, :, :4]
    output = [None for _ in range(len(prediction))]
    for i, image_pred in enumerate(prediction):
        # If none are remaining => process next image
        if not image_pred.size(0):
            continue
        # Get score and class with highest confidence
        class_conf, class_pred = torch.max(image_pred[:, 5: 5 + num_classes], 1, keepdim=True)
        conf_mask = (image_pred[:, 4] * class_conf.squeeze() >= conf_thre).squeeze()
        # _, conf_mask = torch.topk((image_pred[:, 4] * class_conf.squeeze()), 1000)
        # Detections ordered as (x1, y1, x2, y2, obj_conf, class_conf, class_pred)
        detections = torch.cat((image_pred[:, :5], class_conf, class_pred.float()), 1)
        detections = detections[conf_mask]
        if not detections.size(0):
            continue
        # batched_nms keeps boxes of different classes from suppressing each other
        nms_out_index = torchvision.ops.batched_nms(
            detections[:, :4],
            detections[:, 4] * detections[:, 5],
            detections[:, 6],
            nms_thre,
        )
        detections = detections[nms_out_index]
        if output[i] is None:
            output[i] = detections
        else:
            output[i] = torch.cat((output[i], detections))
    return output
def bboxes_iou(bboxes_a, bboxes_b, xyxy=True):
    """Pairwise IoU between two box sets.

    Args:
        bboxes_a: (N, 4) tensor.
        bboxes_b: (M, 4) tensor.
        xyxy: True for corner encoding (x1, y1, x2, y2); False for center
            encoding (cx, cy, w, h).
    Returns:
        (N, M) tensor of IoU values.
    Raises:
        IndexError: if either input does not have 4 columns.
    """
    if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 4:
        raise IndexError
    if xyxy:
        top_left = torch.max(bboxes_a[:, None, :2], bboxes_b[:, :2])
        bottom_right = torch.min(bboxes_a[:, None, 2:], bboxes_b[:, 2:])
        area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1)
        area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1)
    else:
        half_a = bboxes_a[:, None, 2:] / 2
        half_b = bboxes_b[:, 2:] / 2
        top_left = torch.max(bboxes_a[:, None, :2] - half_a, bboxes_b[:, :2] - half_b)
        bottom_right = torch.min(bboxes_a[:, None, :2] + half_a, bboxes_b[:, :2] + half_b)
        area_a = torch.prod(bboxes_a[:, 2:], 1)
        area_b = torch.prod(bboxes_b[:, 2:], 1)
    # zero out pairs that do not overlap along both axes
    overlaps = (top_left < bottom_right).type(top_left.type()).prod(dim=2)
    area_i = torch.prod(bottom_right - top_left, 2) * overlaps
    return area_i / (area_a[:, None] + area_b - area_i)
def matrix_iou(a, b):
    """Pairwise IoU of xyxy box arrays ``a`` (N, 4) and ``b`` (M, 4).

    Numpy version used for data augmentation; returns an (N, M) array.
    """
    top_left = np.maximum(a[:, np.newaxis, :2], b[:, :2])
    bottom_right = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])
    overlap = (top_left < bottom_right).all(axis=2)
    inter = np.prod(bottom_right - top_left, axis=2) * overlap
    area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
    area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
    # epsilon guards against division by zero for degenerate boxes
    return inter / (area_a[:, np.newaxis] + area_b - inter + 1e-12)
def adjust_box_anns(bbox, scale_ratio, padw, padh, w_max, h_max):
    """Scale and shift xyxy box annotations in place, clipping to the image.

    Mutates and returns ``bbox``: x coords become x*scale+padw in [0, w_max],
    y coords become y*scale+padh in [0, h_max].
    """
    scaled_x = bbox[:, 0::2] * scale_ratio + padw
    scaled_y = bbox[:, 1::2] * scale_ratio + padh
    bbox[:, 0::2] = np.clip(scaled_x, 0, w_max)
    bbox[:, 1::2] = np.clip(scaled_y, 0, h_max)
    return bbox
def xyxy2xywh(bboxes):
    """Convert corner boxes to (x1, y1, w, h) in place and return them."""
    bboxes[:, 2] -= bboxes[:, 0]
    bboxes[:, 3] -= bboxes[:, 1]
    return bboxes
================================================
FILE: detector/YOLOX/yolox/utils/checkpoint.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import os
import shutil
from loguru import logger
import torch
def load_ckpt(model, ckpt):
    """Load matching weights from ``ckpt`` (state-dict mapping) into ``model``.

    Keys that are missing from the checkpoint or whose shape differs from the
    model are skipped with a logged warning. Returns the (mutated) model.
    """
    filtered = {}
    for name, param in model.state_dict().items():
        if name not in ckpt:
            logger.warning(
                "{} is not in the ckpt. Please double check and see if this is desired.".format(
                    name
                )
            )
            continue
        candidate = ckpt[name]
        if param.shape != candidate.shape:
            logger.warning(
                "Shape of {} in checkpoint is {}, while shape of {} in model is {}.".format(
                    name, candidate.shape, name, param.shape
                )
            )
            continue
        filtered[name] = candidate
    model.load_state_dict(filtered, strict=False)
    return model
def save_checkpoint(state, is_best, save_dir, model_name=""):
    """Serialize ``state`` to "<model_name>_ckpt.pth.tar" under ``save_dir``;
    additionally copy it to "best_ckpt.pth.tar" when ``is_best`` is set.
    """
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    ckpt_path = os.path.join(save_dir, model_name + "_ckpt.pth.tar")
    torch.save(state, ckpt_path)
    if is_best:
        shutil.copyfile(ckpt_path, os.path.join(save_dir, "best_ckpt.pth.tar"))
================================================
FILE: detector/YOLOX/yolox/utils/demo_utils.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import os
import numpy as np
__all__ = ["mkdir", "nms", "multiclass_nms", "demo_postprocess"]
def mkdir(path):
    """Create ``path`` (including parents) if it does not already exist.

    Fix: use ``exist_ok=True`` instead of check-then-create — the original
    could raise FileExistsError when another process created the directory
    between the ``os.path.exists`` check and ``os.makedirs``.
    """
    os.makedirs(path, exist_ok=True)
def nms(boxes, scores, nms_thr):
    """Single class NMS implemented in Numpy.

    boxes: (N, 4) xyxy array; scores: (N,) array.
    Returns the list of kept indices, highest score first.
    """
    x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    # +1 convention: pixel-inclusive box extents
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    remaining = scores.argsort()[::-1]
    keep = []
    while remaining.size > 0:
        best = remaining[0]
        keep.append(best)
        rest = remaining[1:]
        # clipped intersection of the best box with every remaining box
        iw = np.maximum(0.0, np.minimum(x2[best], x2[rest]) - np.maximum(x1[best], x1[rest]) + 1)
        ih = np.maximum(0.0, np.minimum(y2[best], y2[rest]) - np.maximum(y1[best], y1[rest]) + 1)
        inter = iw * ih
        iou = inter / (areas[best] + areas[rest] - inter)
        remaining = rest[iou <= nms_thr]
    return keep
def multiclass_nms(boxes, scores, nms_thr, score_thr):
    """Multiclass NMS implemented in Numpy

    boxes: (N, 4); scores: (N, num_classes). Returns an (M, 6) array of
    [x1, y1, x2, y2, score, class_id] rows, or None when nothing survives.
    """
    final_dets = []
    for cls_ind in range(scores.shape[1]):
        cls_scores = scores[:, cls_ind]
        mask = cls_scores > score_thr
        if not mask.any():
            continue
        cls_boxes = boxes[mask]
        cls_valid_scores = cls_scores[mask]
        keep = nms(cls_boxes, cls_valid_scores, nms_thr)
        if not keep:
            continue
        labels = np.ones((len(keep), 1)) * cls_ind
        final_dets.append(
            np.concatenate([cls_boxes[keep], cls_valid_scores[keep, None], labels], 1)
        )
    if not final_dets:
        return None
    return np.concatenate(final_dets, 0)
def demo_postprocess(outputs, img_size, p6=False):
    """Decode raw YOLOX head outputs into input-image pixel coordinates.

    Args:
        outputs: (batch, num_anchors, 5+num_classes) array whose first four
            channels are (cx, cy, log-w, log-h) relative to the FPN grid cell.
        img_size: (height, width) of the network input.
        p6 (bool): include the extra stride-64 pyramid level when True.
    Returns:
        The same array with channels 0-3 rewritten as absolute pixel values.
    """
    grids = []
    expanded_strides = []
    if not p6:
        strides = [8, 16, 32]
    else:
        strides = [8, 16, 32, 64]
    hsizes = [img_size[0]//stride for stride in strides]
    wsizes = [img_size[1]//stride for stride in strides]
    for hsize, wsize, stride in zip(hsizes, wsizes, strides):
        # NOTE(review): np.meshgrid(arange(hsize), arange(wsize)) produces
        # (wsize, hsize)-shaped grids; for the square inputs used by the demo
        # this matches the head's flattening order — confirm before feeding
        # non-square input shapes.
        xv, yv = np.meshgrid(np.arange(hsize), np.arange(wsize))
        grid = np.stack((xv, yv), 2).reshape(1, -1, 2)
        grids.append(grid)
        shape = grid.shape[:2]
        expanded_strides.append(np.full((*shape, 1), stride))
    grids = np.concatenate(grids, 1)
    expanded_strides = np.concatenate(expanded_strides, 1)
    # cell offset + grid position, scaled by stride -> pixel centers
    outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides
    # exp() undoes the log-space width/height encoding
    outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides
    return outputs
================================================
FILE: detector/YOLOX/yolox/utils/dist.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# This file mainly comes from
# https://github.com/facebookresearch/detectron2/blob/master/detectron2/utils/comm.py
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
"""
This file contains primitives for multi-gpu communication.
This is useful when doing distributed training.
"""
import functools
import logging
import pickle
import time
import numpy as np
import torch
from torch import distributed as dist
__all__ = [
"is_main_process",
"synchronize",
"get_world_size",
"get_rank",
"get_local_rank",
"get_local_size",
"time_synchronized",
"gather",
"all_gather",
]
_LOCAL_PROCESS_GROUP = None
def synchronize():
    """
    Helper function to synchronize (barrier) among all processes when using distributed training
    """
    # no-op outside an initialized multi-process run
    if not (dist.is_available() and dist.is_initialized()):
        return
    if dist.get_world_size() == 1:
        return
    dist.barrier()
def get_world_size() -> int:
    """Size of the default process group; 1 outside distributed runs."""
    if dist.is_available() and dist.is_initialized():
        return dist.get_world_size()
    return 1
def get_rank() -> int:
    """Global rank of this process; 0 outside distributed runs."""
    if dist.is_available() and dist.is_initialized():
        return dist.get_rank()
    return 0
def get_local_rank() -> int:
    """
    Returns:
        The rank of the current process within the local (per-machine) process group.
    """
    if not (dist.is_available() and dist.is_initialized()):
        return 0
    assert _LOCAL_PROCESS_GROUP is not None
    return dist.get_rank(group=_LOCAL_PROCESS_GROUP)
def get_local_size() -> int:
    """
    Returns:
        The size of the per-machine process group, i.e. the number of processes per machine.
    """
    if not (dist.is_available() and dist.is_initialized()):
        return 1
    return dist.get_world_size(group=_LOCAL_PROCESS_GROUP)
def is_main_process() -> bool:
    """True on the rank-0 process (the one that should log / write files)."""
    return get_rank() == 0
@functools.lru_cache()
def _get_global_gloo_group():
    """
    Return a process group based on gloo backend, containing all the ranks
    The result is cached.
    """
    # NCCL cannot transport the CPU byte tensors used for pickled payloads,
    # so build a dedicated gloo group; with gloo the default group already works.
    if dist.get_backend() == "nccl":
        return dist.new_group(backend="gloo")
    else:
        return dist.group.WORLD
def _serialize_to_tensor(data, group):
    """Pickle ``data`` into a uint8 tensor placed on the device matching the
    group's backend (CPU for gloo, CUDA for nccl)."""
    backend = dist.get_backend(group)
    assert backend in ["gloo", "nccl"]
    device = torch.device("cpu" if backend == "gloo" else "cuda")
    buffer = pickle.dumps(data)
    # warn on payloads over 1 GiB: gathering them is usually a mistake
    if len(buffer) > 1024 ** 3:
        logger = logging.getLogger(__name__)
        logger.warning(
            "Rank {} trying to all-gather {:.2f} GB of data on device {}".format(
                get_rank(), len(buffer) / (1024 ** 3), device
            )
        )
    storage = torch.ByteStorage.from_buffer(buffer)
    tensor = torch.ByteTensor(storage).to(device=device)
    return tensor
def _pad_to_largest_tensor(tensor, group):
    """
    Returns:
        list[int]: size of the tensor, on each rank
        Tensor: padded tensor that has the max size
    """
    world_size = dist.get_world_size(group=group)
    assert (
        world_size >= 1
    ), "comm.gather/all_gather must be called from ranks within the given group!"
    # exchange every rank's payload length first
    local_size = torch.tensor([tensor.numel()], dtype=torch.int64, device=tensor.device)
    size_list = [
        torch.zeros([1], dtype=torch.int64, device=tensor.device)
        for _ in range(world_size)
    ]
    dist.all_gather(size_list, local_size, group=group)
    size_list = [int(size.item()) for size in size_list]
    max_size = max(size_list)
    # we pad the tensor because torch all_gather does not support
    # gathering tensors of different shapes
    if local_size != max_size:
        padding = torch.zeros(
            (max_size - local_size,), dtype=torch.uint8, device=tensor.device
        )
        tensor = torch.cat((tensor, padding), dim=0)
    return size_list, tensor
def all_gather(data, group=None):
    """
    Run all_gather on arbitrary picklable data (not necessarily tensors).
    Args:
        data: any picklable object
        group: a torch process group. By default, will use a group which
            contains all ranks on gloo backend.
    Returns:
        list[data]: list of data gathered from each rank
    """
    if get_world_size() == 1:
        return [data]
    if group is None:
        group = _get_global_gloo_group()
    if dist.get_world_size(group) == 1:
        return [data]
    tensor = _serialize_to_tensor(data, group)
    # pad to a common length: all_gather requires equal shapes on every rank
    size_list, tensor = _pad_to_largest_tensor(tensor, group)
    max_size = max(size_list)
    # receiving Tensor from all ranks
    tensor_list = [
        torch.empty((max_size,), dtype=torch.uint8, device=tensor.device)
        for _ in size_list
    ]
    dist.all_gather(tensor_list, tensor, group=group)
    data_list = []
    for size, tensor in zip(size_list, tensor_list):
        # strip each rank's padding before unpickling
        buffer = tensor.cpu().numpy().tobytes()[:size]
        data_list.append(pickle.loads(buffer))
    return data_list
def gather(data, dst=0, group=None):
    """
    Run gather on arbitrary picklable data (not necessarily tensors).
    Args:
        data: any picklable object
        dst (int): destination rank
        group: a torch process group. By default, will use a group which
            contains all ranks on gloo backend.
    Returns:
        list[data]: on dst, a list of data gathered from each rank. Otherwise,
            an empty list.
    """
    if get_world_size() == 1:
        return [data]
    if group is None:
        group = _get_global_gloo_group()
    if dist.get_world_size(group=group) == 1:
        return [data]
    rank = dist.get_rank(group=group)
    tensor = _serialize_to_tensor(data, group)
    # pad to a common length: gather requires equal shapes on every rank
    size_list, tensor = _pad_to_largest_tensor(tensor, group)
    # receiving Tensor from all ranks
    if rank == dst:
        max_size = max(size_list)
        tensor_list = [
            torch.empty((max_size,), dtype=torch.uint8, device=tensor.device)
            for _ in size_list
        ]
        dist.gather(tensor, tensor_list, dst=dst, group=group)
        data_list = []
        for size, tensor in zip(size_list, tensor_list):
            # strip padding before unpickling each rank's payload
            buffer = tensor.cpu().numpy().tobytes()[:size]
            data_list.append(pickle.loads(buffer))
        return data_list
    else:
        # non-destination ranks only contribute; they receive nothing
        dist.gather(tensor, [], dst=dst, group=group)
        return []
def shared_random_seed():
    """
    Returns:
        int: a random number that is the same across all workers.
        If workers need a shared RNG, they can use this shared seed to
        create one.
    All workers must call this function, otherwise it will deadlock.
    """
    ints = np.random.randint(2 ** 31)
    all_ints = all_gather(ints)
    # every rank adopts rank 0's draw
    return all_ints[0]
def time_synchronized():
    """Wall-clock time that is accurate in the presence of async CUDA work."""
    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        # flush pending kernels so the clock reflects finished GPU work
        torch.cuda.synchronize()
    return time.time()
================================================
FILE: detector/YOLOX/yolox/utils/ema.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import math
from copy import deepcopy
import torch
import torch.nn as nn
def is_parallel(model):
    """Check if model is in parallel mode.

    Fix: the apex import is now optional. The original did an unconditional
    ``import apex`` inside the function and raised ImportError on installs
    without apex, even though apex is only needed to recognize its own DDP
    wrapper; torch's DataParallel/DistributedDataParallel need no apex.
    """
    parallel_types = [
        nn.parallel.DataParallel,
        nn.parallel.DistributedDataParallel,
    ]
    try:
        import apex
        parallel_types.append(apex.parallel.distributed.DistributedDataParallel)
    except ImportError:
        pass  # apex not installed: torch's own wrappers are still detected
    return isinstance(model, tuple(parallel_types))
def copy_attr(a, b, include=(), exclude=()):
    """Copy public attributes from ``b`` onto ``a``.

    When ``include`` is non-empty only those names are copied; names in
    ``exclude`` or starting with an underscore are always skipped.
    """
    for attr, value in b.__dict__.items():
        skip = (
            (len(include) and attr not in include)
            or attr.startswith("_")
            or attr in exclude
        )
        if not skip:
            setattr(a, attr, value)
class ModelEMA:
    """
    Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models
    Keep a moving average of everything in the model state_dict (parameters and buffers).
    This is intended to allow functionality like
    https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
    A smoothed version of the weights is necessary for some training schemes to perform well.
    This class is sensitive where it is initialized in the sequence of model init,
    GPU assignment and distributed training wrappers.
    """
    def __init__(self, model, decay=0.9999, updates=0):
        """
        Args:
            model (nn.Module): model to apply EMA.
            decay (float): ema decay rate.
            updates (int): counter of EMA updates.
        """
        # Create EMA(FP32); unwrap .module when the model is parallel-wrapped.
        # NOTE(review): is_parallel imports apex — crashes without apex installed.
        self.ema = deepcopy(model.module if is_parallel(model) else model).eval()
        self.updates = updates
        # decay exponential ramp (to help early epochs)
        self.decay = lambda x: decay * (1 - math.exp(-x / 2000))
        for p in self.ema.parameters():
            p.requires_grad_(False)
    def update(self, model):
        # Update EMA parameters
        with torch.no_grad():
            self.updates += 1
            d = self.decay(self.updates)
            msd = (
                model.module.state_dict() if is_parallel(model) else model.state_dict()
            )  # model state_dict
            for k, v in self.ema.state_dict().items():
                # only float entries are averaged; integer buffers
                # (e.g. num_batches_tracked) keep the EMA copy's value
                if v.dtype.is_floating_point:
                    # in-place: ema = d * ema + (1 - d) * model
                    v *= d
                    v += (1.0 - d) * msd[k].detach()
    def update_attr(self, model, include=(), exclude=("process_group", "reducer")):
        # Update EMA attributes (plain python attrs, not tensors)
        copy_attr(self.ema, model, include, exclude)
================================================
FILE: detector/YOLOX/yolox/utils/logger.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import inspect
import os
import sys
from loguru import logger
def get_caller_name(depth=0):
    """
    Args:
        depth (int): Depth of caller context, use 0 for caller depth. Default value: 0.
    Returns:
        str: module name of the caller
    """
    # walk `depth + 1` frames up the stack; cheaper than inspect.stack()
    frame = inspect.currentframe()
    for _ in range(depth + 1):
        frame = frame.f_back
    return frame.f_globals["__name__"]
class StreamToLoguru:
    """
    stream object that redirects writes to a logger instance.
    """
    def __init__(self, level="INFO", caller_names=("apex", "pycocotools")):
        """
        Args:
            level(str): log level string of loguru. Default value: "INFO".
            caller_names(tuple): caller names of redirected module.
                Default value: (apex, pycocotools).
        """
        self.level = level
        self.linebuf = ""  # NOTE(review): never read; kept for stream API parity
        self.caller_names = caller_names
    def write(self, buf):
        # top-level package name of whoever called print()/write()
        full_name = get_caller_name(depth=1)
        module_name = full_name.rsplit(".", maxsplit=-1)[0]
        if module_name in self.caller_names:
            for line in buf.rstrip().splitlines():
                # use caller level log
                logger.opt(depth=2).log(self.level, line.rstrip())
        else:
            # output from packages not in caller_names goes to the real stdout
            sys.__stdout__.write(buf)
    def flush(self):
        # required by the file-like API; nothing is buffered here
        pass
def redirect_sys_output(log_level="INFO"):
    """Route both stdout and stderr through a StreamToLoguru instance."""
    stream = StreamToLoguru(log_level)
    sys.stdout = stream
    sys.stderr = stream
def setup_logger(save_dir, distributed_rank=0, filename="log.txt", mode="a"):
    """setup logger for training and testing.
    Args:
        save_dir(str): location to save log file
        distributed_rank(int): device rank when multi-gpu environment
        filename (string): log save name.
        mode(str): log file write mode, `append` or `override`. default is `a`.
    Return:
        logger instance.
    """
    loguru_format = (
        "{time:YYYY-MM-DD HH:mm:ss} | "
        "{level: <8} | "
        "{name} :{line} - {message} "
    )
    # drop loguru's default sink before installing ours
    logger.remove()
    save_file = os.path.join(save_dir, filename)
    # "o" (override) mode: start from an empty log file
    if mode == "o" and os.path.exists(save_file):
        os.remove(save_file)
    # only keep logger in rank0 process
    if distributed_rank == 0:
        logger.add(
            sys.stderr,
            format=loguru_format,
            level="INFO",
            enqueue=True,
        )
        logger.add(save_file)
    # redirect stdout/stderr to loguru
    redirect_sys_output("INFO")
================================================
FILE: detector/YOLOX/yolox/utils/lr_scheduler.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import math
from functools import partial
class LRScheduler:
    """Factory mapping a schedule name to a per-iteration learning-rate function."""
    def __init__(self, name, lr, iters_per_epoch, total_epochs, **kwargs):
        """
        Supported lr schedulers: [cos, warmcos, multistep,
        yoloxwarmcos, yoloxsemiwarmcos]
        Args:
            lr (float): learning rate.
            iters_per_epoch (int): number of iterations in one epoch.
            total_epochs (int): number of epochs in training.
            kwargs (dict):
                - cos: None
                - warmcos: [warmup_epochs, warmup_lr_start (default 1e-6)]
                - multistep: [milestones (epochs), gamma (default 0.1)]
        """
        self.lr = lr
        self.iters_per_epoch = iters_per_epoch
        self.total_epochs = total_epochs
        self.total_iters = iters_per_epoch * total_epochs
        # schedule-specific options (warmup_epochs, milestones, ...) become attributes
        self.__dict__.update(kwargs)
        self.lr_func = self._get_lr_func(name)
    def update_lr(self, iters):
        """Return the learning rate for global iteration ``iters``."""
        return self.lr_func(iters)
    def _get_lr_func(self, name):
        # bind all schedule constants now so the result is a function of `iters` only
        if name == "cos":  # cosine lr schedule
            lr_func = partial(cos_lr, self.lr, self.total_iters)
        elif name == "warmcos":
            warmup_total_iters = self.iters_per_epoch * self.warmup_epochs
            warmup_lr_start = getattr(self, "warmup_lr_start", 1e-6)
            lr_func = partial(
                warm_cos_lr,
                self.lr,
                self.total_iters,
                warmup_total_iters,
                warmup_lr_start,
            )
        elif name == "yoloxwarmcos":
            warmup_total_iters = self.iters_per_epoch * self.warmup_epochs
            no_aug_iters = self.iters_per_epoch * self.no_aug_epochs
            warmup_lr_start = getattr(self, "warmup_lr_start", 0)
            min_lr_ratio = getattr(self, "min_lr_ratio", 0.2)
            lr_func = partial(
                yolox_warm_cos_lr,
                self.lr,
                min_lr_ratio,
                self.total_iters,
                warmup_total_iters,
                warmup_lr_start,
                no_aug_iters,
            )
        elif name == "yoloxsemiwarmcos":
            warmup_lr_start = getattr(self, "warmup_lr_start", 0)
            min_lr_ratio = getattr(self, "min_lr_ratio", 0.2)
            warmup_total_iters = self.iters_per_epoch * self.warmup_epochs
            no_aug_iters = self.iters_per_epoch * self.no_aug_epochs
            normal_iters = self.iters_per_epoch * self.semi_epoch
            # the semi-supervised phase runs with a different epoch length
            semi_iters = self.iters_per_epoch_semi * (
                self.total_epochs - self.semi_epoch - self.no_aug_epochs
            )
            lr_func = partial(
                yolox_semi_warm_cos_lr,
                self.lr,
                min_lr_ratio,
                warmup_lr_start,
                self.total_iters,
                normal_iters,
                no_aug_iters,
                warmup_total_iters,
                semi_iters,
                self.iters_per_epoch,
                self.iters_per_epoch_semi,
            )
        elif name == "multistep":  # stepwise lr schedule
            # convert epoch milestones to iteration counts
            milestones = [
                int(self.total_iters * milestone / self.total_epochs)
                for milestone in self.milestones
            ]
            gamma = getattr(self, "gamma", 0.1)
            lr_func = partial(multistep_lr, self.lr, milestones, gamma)
        else:
            raise ValueError("Scheduler version {} not supported.".format(name))
        return lr_func
def cos_lr(lr, total_iters, iters):
    """Cosine learning rate"""
    return lr * 0.5 * (1.0 + math.cos(math.pi * iters / total_iters))
def warm_cos_lr(lr, total_iters, warmup_total_iters, warmup_lr_start, iters):
    """Cosine learning rate with warm up."""
    if iters <= warmup_total_iters:
        # linear ramp from warmup_lr_start up to lr
        slope = (lr - warmup_lr_start) / float(warmup_total_iters)
        return slope * iters + warmup_lr_start
    progress = (iters - warmup_total_iters) / (total_iters - warmup_total_iters)
    return lr * 0.5 * (1.0 + math.cos(math.pi * progress))
def yolox_warm_cos_lr(
    lr,
    min_lr_ratio,
    total_iters,
    warmup_total_iters,
    warmup_lr_start,
    no_aug_iter,
    iters,
):
    """Cosine learning rate with warm up."""
    min_lr = lr * min_lr_ratio
    if iters <= warmup_total_iters:
        # quadratic warmup from warmup_lr_start to lr
        frac = iters / float(warmup_total_iters)
        return (lr - warmup_lr_start) * frac * frac + warmup_lr_start
    if iters >= total_iters - no_aug_iter:
        # final no-augmentation phase trains at the floor lr
        return min_lr
    span = total_iters - warmup_total_iters - no_aug_iter
    cosine = math.cos(math.pi * (iters - warmup_total_iters) / span)
    return min_lr + 0.5 * (lr - min_lr) * (1.0 + cosine)
def yolox_semi_warm_cos_lr(
    lr,
    min_lr_ratio,
    warmup_lr_start,
    total_iters,
    normal_iters,
    no_aug_iters,
    warmup_total_iters,
    semi_iters,
    iters_per_epoch,
    iters_per_epoch_semi,
    iters,
):
    """Cosine learning rate with warm up.

    Semi-supervised variant: after ``normal_iters`` the run switches to a
    different epoch length (``iters_per_epoch_semi``), so the cosine phase
    rescales the iteration count to keep the curve continuous.
    """
    min_lr = lr * min_lr_ratio
    if iters <= warmup_total_iters:
        # lr = (lr - warmup_lr_start) * iters / float(warmup_total_iters) + warmup_lr_start
        # quadratic warmup from warmup_lr_start to lr
        lr = (lr - warmup_lr_start) * pow(
            iters / float(warmup_total_iters), 2
        ) + warmup_lr_start
    elif iters >= normal_iters + semi_iters:
        # trailing no-augmentation phase: constant floor lr
        lr = min_lr
    elif iters <= normal_iters:
        # standard cosine decay during the fully-supervised iterations
        lr = min_lr + 0.5 * (lr - min_lr) * (
            1.0
            + math.cos(
                math.pi
                * (iters - warmup_total_iters)
                / (total_iters - warmup_total_iters - no_aug_iters)
            )
        )
    else:
        # semi-supervised phase: progress beyond normal_iters is rescaled by
        # the ratio of epoch lengths so the cosine continues smoothly
        lr = min_lr + 0.5 * (lr - min_lr) * (
            1.0
            + math.cos(
                math.pi
                * (
                    normal_iters
                    - warmup_total_iters
                    + (iters - normal_iters)
                    * iters_per_epoch
                    * 1.0
                    / iters_per_epoch_semi
                )
                / (total_iters - warmup_total_iters - no_aug_iters)
            )
        )
    return lr
def multistep_lr(lr, milestones, gamma, iters):
    """MultiStep learning rate"""
    passed = sum(1 for milestone in milestones if iters >= milestone)
    return lr * gamma ** passed
================================================
FILE: detector/YOLOX/yolox/utils/metric.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import functools
import os
import time
from collections import defaultdict, deque
import numpy as np
import torch
__all__ = [
"AverageMeter",
"MeterBuffer",
"get_total_and_free_memory_in_Mb",
"occumpy_mem",
"gpu_mem_usage",
]
def get_total_and_free_memory_in_Mb(cuda_device):
    """Query nvidia-smi for the total and used memory (MB) of ``cuda_device``.

    Fix: the ``os.popen`` pipe is now closed via a context manager — the
    original never closed it and leaked a file descriptor on every call.

    Args:
        cuda_device: device index (int or numeric string) into nvidia-smi's
            per-GPU output lines.
    Returns:
        (total_mb, used_mb) as ints.
    """
    with os.popen(
        "nvidia-smi --query-gpu=memory.total,memory.used --format=csv,nounits,noheader"
    ) as pipe:
        devices_info = pipe.read().strip().split("\n")
    total, used = devices_info[int(cuda_device)].split(",")
    return int(total), int(used)
def occumpy_mem(cuda_device, mem_ratio=0.9):
    """
    pre-allocate gpu memory for training to avoid memory Fragmentation.
    """
    total, used = get_total_and_free_memory_in_Mb(cuda_device)
    max_mem = int(total * mem_ratio)
    block_mem = max_mem - used
    # 256*1024 floats = 1 MiB, so this tensor claims ~block_mem MiB; the
    # caching allocator keeps the reservation after `del`
    x = torch.cuda.FloatTensor(256, 1024, block_mem)
    del x
    # give the allocator time to settle before training starts
    time.sleep(5)
def gpu_mem_usage():
    """
    Compute the GPU memory usage for the current device (MB).
    """
    # peak (not current) bytes ever allocated by torch on this device
    mem_usage_bytes = torch.cuda.max_memory_allocated()
    return mem_usage_bytes / (1024 * 1024)
class AverageMeter:
    """Track a series of values and provide access to smoothed values over a
    window or the global series average.
    """
    def __init__(self, window_size=50):
        # bounded window for the smoothed statistics
        self._deque = deque(maxlen=window_size)
        # running totals over the full history (survive window eviction)
        self._total = 0.0
        self._count = 0
    def update(self, value):
        """Record one new value."""
        self._deque.append(value)
        self._count += 1
        self._total += value
    @property
    def median(self):
        """Median of the values currently in the window (nan when empty)."""
        return np.median(np.array(list(self._deque)))
    @property
    def avg(self):
        # if deque is empty, nan will be returned.
        return np.array(list(self._deque)).mean()
    @property
    def global_avg(self):
        """Mean over every value ever recorded (0 before the first update)."""
        return self._total / max(self._count, 1e-5)
    @property
    def latest(self):
        """Most recent value, or None before the first update."""
        if self._deque:
            return self._deque[-1]
        return None
    @property
    def total(self):
        """Sum of every value ever recorded."""
        return self._total
    def reset(self):
        """Forget both the window and the global totals."""
        self._deque.clear()
        self._total = 0.0
        self._count = 0
    def clear(self):
        """Forget only the windowed values; global totals are kept."""
        self._deque.clear()
class MeterBuffer(defaultdict):
    """Computes and stores the average and current value"""
    def __init__(self, window_size=20):
        # every missing key lazily creates an AverageMeter with this window
        super().__init__(functools.partial(AverageMeter, window_size=window_size))
    def reset(self):
        """Reset every tracked meter (window and global totals)."""
        for meter in self.values():
            meter.reset()
    def get_filtered_meter(self, filter_key="time"):
        """Return the sub-dict of meters whose name contains ``filter_key``."""
        return {name: meter for name, meter in self.items() if filter_key in name}
    def update(self, values=None, **kwargs):
        """Feed one scalar per named meter; kwargs override ``values``."""
        if values is None:
            values = {}
        values.update(kwargs)
        for name, value in values.items():
            self[name].update(value)
    def clear_meters(self):
        """Drop the windowed values of every meter; keep global totals."""
        for meter in self.values():
            meter.clear()
================================================
FILE: detector/YOLOX/yolox/utils/model_utils.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
from copy import deepcopy
import torch
import torch.nn as nn
from thop import profile
__all__ = [
"fuse_conv_and_bn", "fuse_model", "get_model_info", "replace_module",
]
def get_model_info(model, tsize):
    """Return a "Params/Gflops" summary string for ``model`` at test size ``tsize``.

    Profiles a 64x64 dummy input with thop, then rescales the FLOP count to
    ``tsize`` (h, w); the x2 factor converts MACs to FLOPs.
    """
    stride = 64
    img = torch.zeros((1, 3, stride, stride), device=next(model.parameters()).device)
    # profile a deepcopy so thop's hooks never touch the real model
    flops, params = profile(deepcopy(model), inputs=(img,), verbose=False)
    params /= 1e6
    flops /= 1e9
    flops *= tsize[0] * tsize[1] / stride / stride * 2  # Gflops
    info = "Params: {:.2f}M, Gflops: {:.2f}".format(params, flops)
    return info
def fuse_conv_and_bn(conv, bn):
    """Fuse a Conv2d and its trailing BatchNorm2d into one frozen Conv2d.

    https://tehnokv.com/posts/fusing-batchnorm-and-conv/

    Fix: ``dilation`` is now propagated to the fused conv — the original
    dropped it, producing wrong output shapes/values for dilated convolutions.

    Args:
        conv (nn.Conv2d): convolution layer.
        bn (nn.BatchNorm2d): batch norm fed by ``conv``; its eval-mode
            running statistics are folded into the weights.
    Returns:
        nn.Conv2d: fused layer with requires_grad disabled.
    """
    fusedconv = (
        nn.Conv2d(
            conv.in_channels,
            conv.out_channels,
            kernel_size=conv.kernel_size,
            stride=conv.stride,
            padding=conv.padding,
            dilation=conv.dilation,
            groups=conv.groups,
            bias=True,
        )
        .requires_grad_(False)
        .to(conv.weight.device)
    )
    # prepare filters: W_fused = diag(gamma / sqrt(var + eps)) @ W_conv
    w_conv = conv.weight.clone().view(conv.out_channels, -1)
    w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var)))
    fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.shape))
    # prepare spatial bias: b_fused = W_bn @ b_conv + (beta - gamma*mean/sqrt(var+eps))
    b_conv = (
        torch.zeros(conv.weight.size(0), device=conv.weight.device)
        if conv.bias is None
        else conv.bias
    )
    b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(
        torch.sqrt(bn.running_var + bn.eps)
    )
    fusedconv.bias.copy_(torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn)
    return fusedconv
def fuse_model(model):
    """Fold every BaseConv's BatchNorm into its conv, in place; return the model.

    After fusion the module's forward is switched to ``fuseforward`` so the
    removed ``bn`` attribute is never referenced. Intended for inference only.
    """
    from yolox.models.network_blocks import BaseConv
    for m in model.modules():
        # exact type match (not isinstance): subclasses may not have fuseforward
        if type(m) is BaseConv and hasattr(m, "bn"):
            m.conv = fuse_conv_and_bn(m.conv, m.bn)  # update conv
            delattr(m, "bn")  # remove batchnorm
            m.forward = m.fuseforward  # update forward
    return model
def replace_module(module, replaced_module_type, new_module_type, replace_func=None):
    """
    Replace given type in module to a new type. mostly used in deploy.

    Fix: ``replace_func`` is now forwarded through the recursive call — the
    original dropped it, so a custom replace function was only applied at the
    top level and every nested match silently used the default constructor.

    Args:
        module (nn.Module): model to apply replace operation.
        replaced_module_type (Type): module type to be replaced.
        new_module_type (Type)
        replace_func (function): python function to describe replace logic. Default value None.
    Returns:
        model (nn.Module): module that already been replaced.
    """
    def default_replace_func(replaced_module_type, new_module_type):
        # default policy: instantiate the new type with no arguments
        return new_module_type()
    if replace_func is None:
        replace_func = default_replace_func
    model = module
    if isinstance(module, replaced_module_type):
        model = replace_func(replaced_module_type, new_module_type)
    else:  # recursively replace
        for name, child in module.named_children():
            new_child = replace_module(
                child, replaced_module_type, new_module_type, replace_func
            )
            if new_child is not child:  # child is already replaced
                model.add_module(name, new_child)
    return model
================================================
FILE: detector/YOLOX/yolox/utils/setup_env.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import os
import subprocess
import cv2
__all__ = ["configure_nccl", "configure_module"]
def configure_nccl():
    """Configure multi-machine environment variables of NCCL."""
    env = os.environ
    env["NCCL_LAUNCH_MODE"] = "PARALLEL"
    # autodetect the RoCE-capable mlx5 HCAs for NCCL's InfiniBand transport
    env["NCCL_IB_HCA"] = subprocess.getoutput(
        "pushd /sys/class/infiniband/ > /dev/null; for i in mlx5_*; "
        "do cat $i/ports/1/gid_attrs/types/* 2>/dev/null "
        "| grep v >/dev/null && echo $i ; done; popd > /dev/null"
    )
    env["NCCL_IB_GID_INDEX"] = "3"
    env["NCCL_IB_TC"] = "106"
def configure_module(ulimit_value=8192):
    """
    Configure pytorch module environment. setting of ulimit and cv2 will be set.
    Args:
        ulimit_value(int): default open file number on linux. Default value: 8192.
    """
    # raise the soft open-file limit; best effort (fails on Windows or when
    # the requested value exceeds the hard limit)
    try:
        import resource
        _, hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
        resource.setrlimit(resource.RLIMIT_NOFILE, (ulimit_value, hard_limit))
    except Exception:
        pass
    # disable cv2 threading/OpenCL: extra parallelism there can hurt the
    # torch dataloader's worker processes
    os.environ["OPENCV_OPENCL_RUNTIME"] = "disabled"
    try:
        cv2.setNumThreads(0)
        cv2.ocl.setUseOpenCL(False)
    except Exception:
        pass
================================================
FILE: detector/YOLOX/yolox/utils/visualize.py
================================================
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
import cv2
import numpy as np
__all__ = ["vis"]
def vis(img, boxes, scores, cls_ids, conf=0.5, class_names=None):
    """Draw detection boxes with class/score labels onto ``img`` (in place).

    Args:
        img: image array to draw on.
        boxes: per-detection [x0, y0, x1, y1] coordinates.
        scores: per-detection confidence scores.
        cls_ids: per-detection class indices into ``class_names``/``_COLORS``.
        conf: minimum score for a detection to be drawn.
        class_names: sequence of class-name strings.

    Returns:
        The annotated image (the same array object as ``img``).
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    for i, box in enumerate(boxes):
        score = scores[i]
        if score < conf:
            continue
        cls_id = int(cls_ids[i])
        x0, y0 = int(box[0]), int(box[1])
        x1, y1 = int(box[2]), int(box[3])
        base_color = _COLORS[cls_id]
        color = (base_color * 255).astype(np.uint8).tolist()
        # Black text on light boxes, white text on dark ones.
        txt_color = (0, 0, 0) if np.mean(base_color) > 0.5 else (255, 255, 255)
        text = '{}:{:.1f}%'.format(class_names[cls_id], score * 100)
        txt_size = cv2.getTextSize(text, font, 0.4, 1)[0]
        cv2.rectangle(img, (x0, y0), (x1, y1), color, 2)
        # Filled, slightly darkened background behind the label text.
        txt_bk_color = (base_color * 255 * 0.7).astype(np.uint8).tolist()
        cv2.rectangle(
            img,
            (x0, y0 + 1),
            (x0 + txt_size[0] + 1, y0 + int(1.5 * txt_size[1])),
            txt_bk_color,
            -1
        )
        cv2.putText(img, text, (x0, y0 + txt_size[1]), font, 0.4, txt_color, thickness=1)
    return img
# Fixed color palette (one RGB row per class id, float values in [0, 1]);
# indexed by ``cls_id`` in ``vis`` above. Values are the standard detection
# visualization palette; do not reorder rows or class colors will change.
_COLORS = np.array(
    [
        0.000, 0.447, 0.741,
        0.850, 0.325, 0.098,
        0.929, 0.694, 0.125,
        0.494, 0.184, 0.556,
        0.466, 0.674, 0.188,
        0.301, 0.745, 0.933,
        0.635, 0.078, 0.184,
        0.300, 0.300, 0.300,
        0.600, 0.600, 0.600,
        1.000, 0.000, 0.000,
        1.000, 0.500, 0.000,
        0.749, 0.749, 0.000,
        0.000, 1.000, 0.000,
        0.000, 0.000, 1.000,
        0.667, 0.000, 1.000,
        0.333, 0.333, 0.000,
        0.333, 0.667, 0.000,
        0.333, 1.000, 0.000,
        0.667, 0.333, 0.000,
        0.667, 0.667, 0.000,
        0.667, 1.000, 0.000,
        1.000, 0.333, 0.000,
        1.000, 0.667, 0.000,
        1.000, 1.000, 0.000,
        0.000, 0.333, 0.500,
        0.000, 0.667, 0.500,
        0.000, 1.000, 0.500,
        0.333, 0.000, 0.500,
        0.333, 0.333, 0.500,
        0.333, 0.667, 0.500,
        0.333, 1.000, 0.500,
        0.667, 0.000, 0.500,
        0.667, 0.333, 0.500,
        0.667, 0.667, 0.500,
        0.667, 1.000, 0.500,
        1.000, 0.000, 0.500,
        1.000, 0.333, 0.500,
        1.000, 0.667, 0.500,
        1.000, 1.000, 0.500,
        0.000, 0.333, 1.000,
        0.000, 0.667, 1.000,
        0.000, 1.000, 1.000,
        0.333, 0.000, 1.000,
        0.333, 0.333, 1.000,
        0.333, 0.667, 1.000,
        0.333, 1.000, 1.000,
        0.667, 0.000, 1.000,
        0.667, 0.333, 1.000,
        0.667, 0.667, 1.000,
        0.667, 1.000, 1.000,
        1.000, 0.000, 1.000,
        1.000, 0.333, 1.000,
        1.000, 0.667, 1.000,
        0.333, 0.000, 0.000,
        0.500, 0.000, 0.000,
        0.667, 0.000, 0.000,
        0.833, 0.000, 0.000,
        1.000, 0.000, 0.000,
        0.000, 0.167, 0.000,
        0.000, 0.333, 0.000,
        0.000, 0.500, 0.000,
        0.000, 0.667, 0.000,
        0.000, 0.833, 0.000,
        0.000, 1.000, 0.000,
        0.000, 0.000, 0.167,
        0.000, 0.000, 0.333,
        0.000, 0.000, 0.500,
        0.000, 0.000, 0.667,
        0.000, 0.000, 0.833,
        0.000, 0.000, 1.000,
        0.000, 0.000, 0.000,
        0.143, 0.143, 0.143,
        0.286, 0.286, 0.286,
        0.429, 0.429, 0.429,
        0.571, 0.571, 0.571,
        0.714, 0.714, 0.714,
        0.857, 0.857, 0.857,
        0.000, 0.447, 0.741,
        0.314, 0.717, 0.741,
        0.50, 0.5, 0
    ]
).astype(np.float32).reshape(-1, 3)
================================================
FILE: docs/DATA.md
================================================
# Dataset preparation
### Introduction
In this documentation we introduce how to prepare standard datasets to benchmark UniTrack on different tasks. We consider five tasks: Single Object Tracking (SOT) on the OTB 2015 dataset, Video Object Segmentation (VOS) on the DAVIS 2017 dataset, Multiple Object Tracking (MOT) on the MOT 16 dataset, Multiple Object Tracking and Segmentation (MOTS) on the MOTS dataset, and Pose Tracking on the PoseTrack 2018 dataset. Among them, SOT and VOS are propagation-type tasks, in which only one observation (usually in the very first frame) is given to indicate the object to be tracked, while the others are association-type tasks that can make use of observations at every timestamp given by an automatic detector.
- **Table of contents**
- [Prepare OTB 2015 dataset for SOT](#OTB-2015-dataset-for-SOT)
- [Prepare DAVIS 2017 dataset for VOS](#DAVIS-2017-dataset-for-VOS)
- [Prepare MOT 16 dataset for MOT](#MOT-16-dataset-for-MOT)
- [Prepare MOTS dataset for MOTS](#MOTS-dataset-for-MOTS)
- [Prepare PoseTrack 2018 for Pose Tracking](#PoseTrack-2018-dataset-for-Pose-Tracking)
### OTB 2015 dataset for SOT
The [original source](http://cvlab.hanyang.ac.kr/tracker_benchmark/datasets.html) of the OTB benchmark does not provide a convenient way to download the entire dataset. Luckily, [Gluon CV](https://cv.gluon.ai/contents.html) provides a [script](https://cv.gluon.ai/_downloads/719c5c0d73fb22deacc84b4557b6fd5f/otb2015.py) for easily downloading all OTB video sequences. This script includes both dataset downloading and data processing; simply run this script:
`python otb2015.py`
and you will get all the 100 sequences of OTB. After this, you need to copy Jogging to Jogging-1 and Jogging-2, and copy Skating2 to Skating2-1 and Skating2-2, or use softlinks, following [STVIR](https://github.com/STVIR/pysot/tree/master/testing_dataset). Finally, please download OTB2015.json \[[Google Drive](https://drive.google.com/file/d/1jHYta8wsSid9DwcWl5hcNJNPzgQMcI_r/view?usp=sharing)\]\[[Baidu NetDisk](https://pan.baidu.com/s/1d9oR7ZEHq4V5i6bLpEllng)\] (code:k93s) and place it under the OTB-2015 root. The structure should look like this:
```
${OTB_ROOT}
|——— OTB2015.json
|
└———Basketball/
|
└———Biker/
|
...
```
### DAVIS 2017 dataset for VOS
Download DAVIS 2017 trainval via [this link](https://data.vision.ee.ethz.ch/csergi/share/davis/DAVIS-2017-trainval-480p.zip) and unzip it. No other processing is needed.
### MOT 16 dataset for MOT
1. Download MOT-16 dataset from [this page](https://motchallenge.net/data/MOT16/).
2. Get detections for MOT-16 sequences. Here we offer three options:
- Using ground-truth detections. This is feasible only in the *train* split (we do not have labels for the *test* split). Run `python tools/gen_mot16_gt.py` to prepare the detections.
- Using three kind of official detections (DPM/FRCNN/SDP) provided by MOT Challenge. Detections are from MOT-17 dataset (the video sequences in MOT-17 are the same as MOT-16), so you may need download MOT-17 and unzip it under the same root of MOT-16 first. Then run `python tools/gen_mot16_label17.py` to prepare the detections. Can generate detections for both *train* and *test* splits.
- [Recommended] Using custom detectors to generate detection results. You need to first run the detector on MOT-16 dataset and output a series of `MOT16-XX.txt` files to store the detection results, where XX ranges from 01 to 14. Each line in the `.txt` file represents a bounding box in format of `[frame_index](starts from 1), x, y, w, h, confidence`. We provide an example generated by FairMOT detector \[[Google Drive](https://drive.google.com/file/d/113xks7UIZ6LeBY_CTlOh5Z_OQ0hiP551/view?usp=sharing)\]/\[[Baidu NetDisk](https://pan.baidu.com/s/1-E9SN4rWWpZRT1ermcX0JA)\] (code:k93s). Finally run `tools/gen_mot16_fairmot.py` to prepare the detections.
A good point is that you can also download `.txt` results of other trackers from the [MOT-16 leaderboard](https://motchallenge.net/results/MOT16/) or of other detectors from the [MOT-17 DET leaderboard](https://motchallenge.net/results/MOT17Det/), and use their detection results with very few modifications on `tools/gen_mot16_fairmot.py`.
### MOTS dataset for MOTS
1. Download the MOTS dataset from [this page](https://motchallenge.net/data/MOTS/).
2. Get segmentation masks for MOTS sequences. Here we offer two options:
- Using ground-truth detections. This is feasible only in the *train* split (we do not have labels for the *test* split). Run `python tools/gen_mots_gt.py` to prepare the detections.
- [Recommended] Using custom models to generate segmentation masks. You need to first run the model on MOTS dataset and output a series of `MOTS-XX.txt` files to store the mask results. See [here](https://motchallenge.net/instructions/) for the output format. Note that we do not use the track "id" field so you can output any number as a place holder. You can download results of off-the-shelf trackers and use their masks, for example, simply download the raw data of results of the COSTA tracker in the bottom of [this page](https://motchallenge.net/method/MOTS=87&chl=17). Finally run `tools/gen_mot16_fairmot.py` to prepare the masks (The script will keep the track "id" field. But again, it should be noted that the track "id" field is ignored when run tracking with UniTrack).
### PoseTrack 2018 dataset for Pose Tracking
1. Register and download [PoseTrack 2018 dataset](https://posetrack.net/).
2. Get single-frame pose estimation results. Run a single-frame pose estimator, and save results in a `$OBS_NAME.json` file. Results should be formatted as instructed [here](https://github.com/leonid-pishchulin/poseval). The "track_id" field is ignored so you can output any number as a placeholder.
3. Put the `.json` file under `$POSETRACK_ROOT/obs/$SPLIT/` folder, where `SPLIT` could be "train" or "val".
================================================
FILE: docs/INSTALL.md
================================================
# Installation
### Requirements
* Nvidia device with CUDA
* Python 3.7+
* PyTorch 1.7.0+
* torchvision 0.8.0+
* Other python packages in requirements.txt
### Code installation
#### (Recommended) Install with conda
Install conda from [here](https://repo.anaconda.com/miniconda/), Miniconda3-latest-(OS)-(platform).
```shell
# 1. Create a conda virtual environment.
conda create -n unitrack python=3.7 -y
conda activate unitrack
# 2. Install PyTorch
conda install pytorch==1.7.0 torchvision cudatoolkit
# 3. Get UniTrack
git clone https://github.com/Zhongdao/UniTrack.git
cd UniTrack
# 4. Install other dependencies
conda install --file requirements.txt
pip install cython_bbox==0.1.3
python setup.py
```
================================================
FILE: docs/MODELZOO.md
================================================
# MODEL ZOO
### Prepare appearance models
One beneficial usage of UniTrack is that it allows easy evaluation of pre-trained models (as appearance models) on diverse tracking tasks. By far we have tested the following models, mostly self-supervised pre-trained:
| Pre-training Method | Architecture |Link |
| :---: | :---: | :---: |
| ImageNet classification | ResNet-50 | torchvision |
| InsDist| ResNet-50 | [Google Drive](https://www.dropbox.com/sh/87d24jqsl6ra7t2/AACcsSIt1_Njv7GsmsuzZ6Sta/InsDis.pth)|
| MoCo-V1| ResNet-50 |[Google Drive](https://dl.fbaipublicfiles.com/moco/moco_checkpoints/moco_v1_200ep/moco_v1_200ep_pretrain.pth.tar)|
| PCL-V1| ResNet-50 |[Google Drive](https://storage.googleapis.com/sfr-pcl-data-research/PCL_checkpoint/PCL_v1_epoch200.pth.tar)|
| PIRL| ResNet-50 | [Google Drive](https://www.dropbox.com/sh/87d24jqsl6ra7t2/AADN4jKnvTI0U5oT6hTmQZz8a/PIRL.pth)|
| PCL-V2| ResNet-50 | [Google Drive](https://storage.googleapis.com/sfr-pcl-data-research/PCL_checkpoint/PCL_v2_epoch200.pth.tar)|
| SimCLR-V1| ResNet-50 |[Google Drive](https://drive.google.com/file/d/1RdB2KaaXOtU2_t-Uk_HQbxMZgSGUcy6c/view?usp=sharing)|
| MoCo-V2| ResNet-50 |[Google Drive](https://dl.fbaipublicfiles.com/moco/moco_checkpoints/moco_v2_800ep/moco_v2_800ep_pretrain.pth.tar)|
| SimCLR-V2| ResNet-50 |[Google Drive](https://drive.google.com/file/d/1NSCrZ7MaejJaOS7yA3URtbubxLR-fz5X/view?usp=sharing)|
| SeLa-V2| ResNet-50 |[Google Drive](https://dl.fbaipublicfiles.com/deepcluster/selav2_400ep_pretrain.pth.tar)|
| InfoMin| ResNet-50 | [Google Drive](https://www.dropbox.com/sh/87d24jqsl6ra7t2/AAAzMTynP3Qc8mIE4XWkgILUa/InfoMin_800.pth)|
| BarlowTwins| ResNet-50 | [Google Drive](https://drive.google.com/file/d/1iXfAiAZP3Lrc-Hk4QHUzO-mk4M4fElQw/view?usp=sharing)|
| BYOL| ResNet-50 | [Google Drive](https://storage.googleapis.com/deepmind-byol/checkpoints/pretrain_res50x1.pkl)|
| DeepCluster-V2| ResNet-50 |[Google Drive](https://dl.fbaipublicfiles.com/deepcluster/deepclusterv2_800ep_pretrain.pth.tar)|
| SwAV| ResNet-50 |[Google Drive](https://dl.fbaipublicfiles.com/deepcluster/swav_800ep_pretrain.pth.tar)|
| PixPro| ResNet-50 |[Google Drive](https://drive.google.com/file/d/1u172sUx-kldPvrZzZxijciBHLMiSJp46/view?usp=sharing)|
| DetCo| ResNet-50 | [Google Drive](https://drive.google.com/file/d/1ahyX8HEbLUZXS-9Jr2GIMWDEZdqWe1GV/view?usp=sharing)|
| TimeCycle| ResNet-50 |[Google Drive](https://drive.google.com/file/d/1WUYLkfowJ853RG_9OhbrKpb3r-cc-cOA/view?usp=sharing)|
| ImageNet classification | ResNet-18 |torchvision|
| Colorization + memory| ResNet-18 | [Google Drive](https://drive.google.com/file/d/1gWPRgYH70t-9uwj0EId826ZxFdosbzQv/view?usp=sharing)|
| UVC| ResNet-18 |[Google Drive](https://drive.google.com/file/d/1nl0ehS8mvE5PUBOPLQSCWtrmFmS0-dPX/view?usp=sharing)|
| CRW| ResNet-18 |[Google Drive](https://drive.google.com/file/d/1C1ujnpFRijJqVD3PV7qzyYwGSWoS9fLb/view?usp=sharing)|
After downloading an appearance model, please place it under `$UNITRACK_ROOT/weights`. A large part of the model checkpoints are adopted from [ssl-transfer](https://github.com/linusericsson/ssl-transfer), many thanks to [linusericsson](https://github.com/linusericsson)!
### Test your own pre-trained models as appearance models
If your model uses the standard ResNet architecture, you can directly test it using UniTrack without additional modifications. If you use ResNet but the parameter names are not consistent with the standard naming, you can simply rename parameter groups and load your weights into the standard ResNet. If you are using other architectures, it is also possible to test it with UniTrack. You may need a little hack: just remember to let the model output 8x down-sampled feature maps. You can check out `models/hrnet.py` for an example.
================================================
FILE: docs/RESULTS.md
================================================
### Quantitative results
**Single Object Tracking (SOT) on OTB-2015**
| Method | SiamFC | SiamRPN | SiamRPN++ | UDT* | UDT+* | LUDT* | LUDT+* | UniTrack_XCorr* | UniTrack_DCF* |
| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| AUC | 58.2 | 63.7 | 69.6 | 59.4 | 63.2 | 60.2 | 63.9 | 55.5 | 61.8|
\* indicates non-supervised methods
**Video Object Segmentation (VOS) on DAVIS-2017 *val* split**
| Method | SiamMask | FeelVOS | STM | Colorization* | TimeCycle* | UVC* | CRW* | VFS* | UniTrack* |
| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| J-mean | 54.3 | 63.7 | 79.2 | 34.6 | 40.1 | 56.7 | 64.8 | 66.5 | 58.4|
\* indicates non-supervised methods
**Multiple Object Tracking (MOT) on MOT-16 [*test* set *private detector* track](https://motchallenge.net/method/MOT=3856&chl=5)**
| Method | POI | DeepSORT-2 | JDE | CTrack | TubeTK | TraDes | CSTrack | FairMOT* | UniTrack* |
| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| IDF-1 | 65.1 | 62.2 | 55.8 | 57.2 | 62.2 | 64.7 | 71.8 | 72.8 | 71.8|
| IDs | 805 | 781 | 1544 | 1897 | 1236 | 1144 | 1071 | 1074 | 683 |
| MOTA | 66.1 | 61.4 | 64.4 | 67.6 | 66.9 | 70.1 | 70.7 | 74.9 | 74.7|
\* indicates methods using the same detections
**Multiple Object Tracking and Segmentation (MOTS) on MOTS challenge [*test* set](https://motchallenge.net/method/MOTS=109&chl=17)**
| Method | TrackRCNN | SORTS | PointTrack | GMPHD | COSTA_st* | UniTrack* |
| :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| IDF-1 | 42.7 | 57.3 | 42.9 | 65.6 | 70.3 | 67.2 |
| IDs | 567 | 577 | 868 | 566 | 421 | 622 |
| sMOTA | 40.6 | 55.0 | 62.3 | 69.0 | 70.2 | 68.9 |
\* indicates methods using the same detections
**Pose Tracking on PoseTrack-2018 *val* split**
| Method | MDPN | OpenSVAI | Miracle | KeyTrack | LightTrack* | UniTrack* |
| :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| IDF-1 | - | - | - | - | 52.2 | 73.2 |
| IDs | - | - | - | - | 3024 | 6760 |
| sMOTA | 50.6 | 62.4 | 64.0 | 66.6 | 64.8 | 63.5 |
\* indicates methods using the same detections
================================================
FILE: docs/RUN.md
================================================
# Run evaluation on multiple tasks
### Prepare config file
To evaluate an appearance model on multiple tasks, first you need to prepare a config file `${EXP_NAME}.yaml` and place it under the `config/` folder. We provide several example config files:
1. `crw_resnet18_s3.yaml` : Self-supervised model trained with Contrastive Random Walk [1], ResNet-18 stage-3 features.
2. `imagenet_resnet18_s3.yaml`: ImageNet pre-trained model, ResNet-18 stage-3 features.
3. `crw_resnet18_s3_womotion.yaml` : Model same as 1 but motion cues are discarded in association type tasks. This way, distinctions between different representations are better highlighted and potential confounding factors are avoided.
4. `imagenet_resnet18_s3_womotion.yaml`: Model same as 2, motion cues are discarded in association-type tasks.
### Note for the config file
When you are testing a new model, please take care to make sure the following fields in the config file are correct:
```yaml
common:
# Experiment name, an identifier.
exp_name: crw_resnet18_s3
# Model type, currently support:
# ['imagenet18', 'imagenet50', 'imagenet101', 'random18', 'random50',
# 'imagenet_resnext50', 'imagenet_resnext101'
# 'byol', 'deepcluster-v2', 'infomin', 'insdis', 'moco-v1', 'moco-v2',
# 'pcl-v1', 'pcl-v2','pirl', 'sela-v2', 'swav', 'simclr-v1', 'simclr-v2',
# 'pixpro', 'detco', 'barlowtwins', 'crw', 'uvc', 'timecycle']
model_type: crw
# For ResNet architecture, remove layer4 means output layer3 features
remove_layers: ['layer4']
# Be careful about this
im_mean: [0.4914, 0.4822, 0.4465]
im_std: [0.2023, 0.1994, 0.2010]
# Path to the model weights.
resume: 'weights/crw.pth'
mot:
# The single-frame observations. should correspond to a folder ${mot_root}/obs/${obid}
obid: 'FairMOT'
# Dataset root
mot_root: '/home/wangzd/datasets/MOT/MOT16'
# There is no validation set, so by default we test on the train split.
mots:
# The single-frame observations. should correspond to a folder ${mots_root}/obs/${obid}
obid: 'COSTA'
# Dataset root
mots_root: '/home/wangzd/datasets/GOT/MOTS'
# There is no validation set, so by default we test on the train split.
posetrack:
# The single-frame observations. should correspond to a folder ${mots_root}/obs/val/${obid}
obid: 'lighttrack_MSRA152'
# Dataset root
data_root: '/home/wangzd/datasets/GOT/Posetrack2018'
# There is a validation set, by default we test on the val split.
split: 'val'
```
For other arguments, just refer to `crw_resnet18_s3.yaml` or `crw_resnet18_s3_womotion.yaml`.
### Run
Suppose the current path is `$UNITRACK_ROOT`, you can run multiple tasks with a single command:
```shell
./eval.sh $EXP_NAME $GPU_ID
```
You will obtain a set of summaries of quantitative results under `results/summary`, and also visualizations of all results under `results`
[1]. Jabri, Allan, Andrew Owens, and Alexei A. Efros. "Space-time correspondence as a contrastive random walk." In NeurIPS, 2020.
================================================
FILE: eval/convert_davis.py
================================================
import os
import numpy as np
import cv2
import os.path as osp
import pdb
from PIL import Image
# Script setup: parse paths, read the DAVIS-2017 val sequence list, and
# load the color palette used to map RGB mask colors back to label ids.
jpglist = []  # DAVIS val sequence names, filled from val.txt below
import palette
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('-o', '--out_folder', default='/scratch/ajabri/davis_results/', type=str)
parser.add_argument('-i', '--in_folder', default='/scratch/ajabri/davis_results_masks/', type=str)
parser.add_argument('-d', '--dataset', default='/scratch/ajabri/data/davis/', type=str)
args = parser.parse_args()
annotations_folder = args.dataset + '/Annotations/480p/'
# One sequence name per line; strip the trailing newline.
f1 = open(args.dataset + '/ImageSets/2017/val.txt', 'r')
for line in f1:
    line = line[:-1]
    jpglist.append(line)
f1.close()
out_folder = args.out_folder
current_folder = args.in_folder
if not os.path.exists(out_folder):
    os.makedirs(out_folder)
# Rebind `palette` from the imported module to its color table as uint8
# (presumably shape (N, 3) RGB rows — confirm in palette.py).
palette = palette.tensor.astype(np.uint8)
def color2id(c):
    """Return the index array of rows in the global ``palette`` equal to color ``c``.

    Length-1 array when the color is in the palette, empty array otherwise.
    """
    is_match = np.all(palette == c, axis=-1)
    return np.flatnonzero(is_match)
def convert_dir(i):
    """Convert the i-th sequence's RGB mask images to paletted index PNGs.

    Reads ``<in_folder>/<i>_<j>_mask.png`` for each frame j, maps every RGB
    color back to its palette index, resizes the index map to the
    ground-truth annotation resolution, and saves a palettized PNG under
    ``<out_folder>/<seq>/<j:05d>.png``.
    """
    fname = jpglist[i]
    gtfolder = osp.join(annotations_folder,fname)
    outfolder = osp.join(out_folder,fname)
    if not os.path.exists(outfolder):
        os.mkdir(outfolder)
    files = [_ for _ in os.listdir(gtfolder) if _[-4:] == '.png']
    # First ground-truth frame determines the target output resolution.
    lblimg = cv2.imread(osp.join(gtfolder,"{:05d}.png".format(0)))
    height = lblimg.shape[0]
    width = lblimg.shape[1]
    for j in range(len(files)):
        outname = osp.join(outfolder, "{:05d}.png".format(j))
        inname = osp.join(current_folder, str(i) + '_' + str(j) + '_mask.png')
        # cv2 loads BGR; reverse the channel axis to get RGB for palette lookup.
        lblimg = cv2.imread(inname)[:,:,::-1]
        flat_lblimg = lblimg.reshape(-1, 3)
        lblidx = np.zeros((lblimg.shape[0], lblimg.shape[1]))
        lblidx2 = np.zeros((lblimg.shape[0], lblimg.shape[1]))
        # Map each distinct RGB color in the frame to its palette index;
        # colors not found in the palette are left as 0 (background).
        colors = np.unique(flat_lblimg, axis=0)
        for c in colors:
            cid = color2id(c)
            if len(cid) > 0:
                lblidx2[np.all(lblimg == c, axis=-1)] = cid
        lblidx = lblidx2
        lblidx = lblidx.astype(np.uint8)
        # Nearest-neighbor resize keeps label ids intact (no interpolation blends).
        lblidx = cv2.resize(lblidx, (width, height), interpolation=cv2.INTER_NEAREST)
        lblidx = lblidx.astype(np.uint8)
        im = Image.fromarray(lblidx)
        im.putpalette(palette.ravel())
        im.save(outname, format='PNG')
import multiprocessing as mp
# Convert all sequences in parallel with 10 worker processes.
# NOTE(review): assumes fork-style multiprocessing; under the "spawn" start
# method each worker would re-run the argparse/file IO at module top level —
# confirm this script is only run on Linux.
pool = mp.Pool(10)
results = pool.map(convert_dir, range(len(jpglist)))
================================================
FILE: eval/davis_dummy.txt
================================================
/home/wangzd/datasets/uvc/DAVIS/JPEGImages/480p/dance-twirl /home/wangzd/datasets/uvc/DAVIS/Annotations/480p/dance-twirl
/home/wangzd/datasets/uvc/DAVIS/JPEGImages/480p/dog /home/wangzd/datasets/uvc/DAVIS/Annotations/480p/dog
/home/wangzd/datasets/uvc/DAVIS/JPEGImages/480p/dogs-jump /home/wangzd/datasets/uvc/DAVIS/Annotations/480p/dogs-jump
/home/wangzd/datasets/uvc/DAVIS/JPEGImages/480p/drift-chicane /home/wangzd/datasets/uvc/DAVIS/Annotations/480p/drift-chicane
/home/wangzd/datasets/uvc/DAVIS/JPEGImages/480p/drift-straight /home/wangzd/datasets/uvc/DAVIS/Annotations/480p/drift-straight
/home/wangzd/datasets/uvc/DAVIS/JPEGImages/480p/goat /home/wangzd/datasets/uvc/DAVIS/Annotations/480p/goat
/home/wangzd/datasets/uvc/DAVIS/JPEGImages/480p/gold-fish /home/wangzd/datasets/uvc/DAVIS/Annotations/480p/gold-fish
/home/wangzd/datasets/uvc/DAVIS/JPEGImages/480p/horsejump-high /home/wangzd/datasets/uvc/DAVIS/Annotations/480p/horsejump-high
/home/wangzd/datasets/uvc/DAVIS/JPEGImages/480p/india /home/wangzd/datasets/uvc/DAVIS/Annotations/480p/india
/home/wangzd/datasets/uvc/DAVIS/JPEGImages/480p/judo /home/wangzd/datasets/uvc/DAVIS/Annotations/480p/judo
/home/wangzd/datasets/uvc/DAVIS/JPEGImages/480p/kite-surf /home/wangzd/datasets/uvc/DAVIS/Annotations/480p/kite-surf
/home/wangzd/datasets/uvc/DAVIS/JPEGImages/480p/lab-coat /home/wangzd/datasets/uvc/DAVIS/Annotations/480p/lab-coat
/home/wangzd/datasets/uvc/DAVIS/JPEGImages/480p/libby /home/wangzd/datasets/uvc/DAVIS/Annotations/480p/libby
/home/wangzd/datasets/uvc/DAVIS/JPEGImages/480p/loading /home/wangzd/datasets/uvc/DAVIS/Annotations/480p/loading
/home/wangzd/datasets/uvc/DAVIS/JPEGImages/480p/mbike-trick /home/wangzd/datasets/uvc/DAVIS/Annotations/480p/mbike-trick
/home/wangzd/datasets/uvc/DAVIS/JPEGImages/480p/motocross-jump /home/wangzd/datasets/uvc/DAVIS/Annotations/480p/motocross-jump
/home/wangzd/datasets/uvc/DAVIS/JPEGImages/480p/paragliding-launch /home/wangzd/datasets/uvc/DAVIS/Annotations/480p/paragliding-launch
/home/wangzd/datasets/uvc/DAVIS/JPEGImages/480p/parkour /home/wangzd/datasets/uvc/DAVIS/Annotations/480p/parkour
/home/wangzd/datasets/uvc/DAVIS/JPEGImages/480p/pigs /home/wangzd/datasets/uvc/DAVIS/Annotations/480p/pigs
/home/wangzd/datasets/uvc/DAVIS/JPEGImages/480p/scooter-black /home/wangzd/datasets/uvc/DAVIS/Annotations/480p/scooter-black
/home/wangzd/datasets/uvc/DAVIS/JPEGImages/480p/shooting /home/wangzd/datasets/uvc/DAVIS/Annotations/480p/shooting
/home/wangzd/datasets/uvc/DAVIS/JPEGImages/480p/soapbox /home/wangzd/datasets/uvc/DAVIS/Annotations/480p/soapbox
================================================
FILE: eval/eval_mot.py
================================================
import os
import numpy as np
import copy
import motmetrics as mm
mm.lap.default_solver = 'lap'
from utils.io import read_mot_results, unzip_objs
class Evaluator(object):
    """Accumulates MOT-metrics statistics for a single sequence.

    Loads ground-truth annotations for ``seq_name`` under ``data_root`` and
    evaluates tracker result files frame by frame using ``motmetrics``.
    """

    def __init__(self, data_root, seq_name, data_type='mot'):
        self.data_root = data_root
        self.seq_name = seq_name
        self.data_type = data_type  # only 'mot' is supported
        self.load_annotations()
        self.reset_accumulator()

    def load_annotations(self):
        """Read this sequence's gt.txt into per-frame dicts."""
        assert self.data_type == 'mot'
        gt_filename = os.path.join(self.data_root, self.seq_name, 'gt', 'gt.txt')
        self.gt_frame_dict = read_mot_results(gt_filename, self.data_type, is_gt=True)
        # Boxes the benchmark marks as "ignore" (excluded from scoring).
        self.gt_ignore_frame_dict = read_mot_results(gt_filename, self.data_type, is_ignore=True)

    def reset_accumulator(self):
        """Start a fresh motmetrics accumulator (frame ids auto-assigned)."""
        self.acc = mm.MOTAccumulator(auto_id=True)

    def eval_frame(self, frame_id, trk_tlwhs, trk_ids, rtn_events=False):
        """Score one frame of tracker output against ground truth.

        Tracker boxes matched (IoU-distance assignment, max_iou=0.5) to
        "ignore" ground-truth boxes are dropped before updating the
        accumulator.

        Args:
            frame_id: frame index key into the gt dicts.
            trk_tlwhs: tracker boxes in (top, left, width, height) format.
            trk_ids: tracker track ids, parallel to ``trk_tlwhs``.
            rtn_events: if True, return per-frame MOT events when the
                accumulator supports them.

        Returns:
            The frame's MOT events, or None.
        """
        # Copy so filtering below cannot mutate the caller's arrays.
        trk_tlwhs = np.copy(trk_tlwhs)
        trk_ids = np.copy(trk_ids)
        # gts
        gt_objs = self.gt_frame_dict.get(frame_id, [])
        gt_tlwhs, gt_ids = unzip_objs(gt_objs)[:2]
        # ignore boxes
        ignore_objs = self.gt_ignore_frame_dict.get(frame_id, [])
        ignore_tlwhs = unzip_objs(ignore_objs)[0]
        # Remove tracker results that overlap "ignore" regions.
        keep = np.ones(len(trk_tlwhs), dtype=bool)
        iou_distance = mm.distances.iou_matrix(ignore_tlwhs, trk_tlwhs, max_iou=0.5)
        if len(iou_distance) > 0:
            match_is, match_js = mm.lap.linear_sum_assignment(iou_distance)
            match_is, match_js = map(lambda a: np.asarray(a, dtype=int), [match_is, match_js])
            match_ious = iou_distance[match_is, match_js]
            # NaN distance means the pairing exceeded max_iou: keep those boxes.
            match_js = match_js[np.logical_not(np.isnan(match_ious))]
            keep[match_js] = False
        trk_tlwhs = trk_tlwhs[keep]
        trk_ids = trk_ids[keep]
        # Distance matrix between ground truth and (filtered) tracker boxes.
        iou_distance = mm.distances.iou_matrix(gt_tlwhs, trk_tlwhs, max_iou=0.5)
        self.acc.update(gt_ids, trk_ids, iou_distance)
        if rtn_events and iou_distance.size > 0 and hasattr(self.acc, 'last_mot_events'):
            events = self.acc.last_mot_events  # only supported by https://github.com/longcw/py-motmetrics
        else:
            events = None
        return events

    def eval_file(self, filename):
        """Evaluate a whole result file; returns the filled accumulator."""
        self.reset_accumulator()
        result_frame_dict = read_mot_results(filename, self.data_type, is_gt=False)
        # Union of gt and result frames, so both misses and false positives count.
        frames = sorted(list(set(self.gt_frame_dict.keys()) | set(result_frame_dict.keys())))
        for frame_id in frames:
            trk_objs = result_frame_dict.get(frame_id, [])
            trk_tlwhs, trk_ids = unzip_objs(trk_objs)[:2]
            self.eval_frame(frame_id, trk_tlwhs, trk_ids, rtn_events=False)
        return self.acc

    @staticmethod
    def get_summary(accs, names, metrics=('mota', 'num_switches', 'idp', 'idr', 'idf1', 'precision', 'recall')):
        """Compute a summary DataFrame over multiple accumulators.

        Args:
            accs: list of MOTAccumulator instances (one per sequence).
            names: sequence names labeling the summary rows.
            metrics: metric names to compute; None selects the full
                MOT-challenge metric set.
        """
        names = copy.deepcopy(names)
        if metrics is None:
            metrics = mm.metrics.motchallenge_metrics
        metrics = copy.deepcopy(metrics)
        mh = mm.metrics.create()
        summary = mh.compute_many(
            accs,
            metrics=metrics,
            names=names,
            generate_overall=True
        )
        return summary

    @staticmethod
    def save_summary(summary, filename):
        """Write the summary DataFrame to an Excel file."""
        import pandas as pd
        # ExcelWriter.save() was deprecated and removed in pandas 2.0; the
        # context manager closes (and saves) the workbook on all supported
        # pandas versions.
        with pd.ExcelWriter(filename) as writer:
            summary.to_excel(writer)
================================================
FILE: eval/eval_pck.py
================================================
from scipy.io import loadmat
from numpy import transpose
import skimage.io as sio
import numpy as np
import os
import cv2
import scipy.io as sio
# Paths to the JHMDB split list and the directory of pose-propagation results.
filelist = '/home/wangzd/datasets/GOT/JHMDB/split.txt'
src_folder = 'results/poseprop/womotion_resnet18_s3/'
f = open(filelist, 'r')
# Per-video accumulators filled by the loops below.
gts = []              # ground-truth joint coordinates per video
heights = []          # first-frame image height per video
widths = []           # first-frame image width per video
preds = []            # predicted joint coordinates per video
jnt_visible_set = []  # 0/1 joint visibility mask per video
human_boxes = []      # per-frame normalization scale per video
feat_res = 40         # predictions live on a feat_res x feat_res grid
# Each split line holds: <label .mat path> <image folder>. Read the
# ground-truth joints and the first frame to recover image resolution.
for cnt, line in enumerate(f):
    rows = line.strip().split()
    lblpath = rows[0] #+ '/joint_positions.mat'
    lbls_mat = sio.loadmat(lblpath)
    lbls_coord = lbls_mat['pos_img']
    # Matlab coordinates are 1-based; shift to 0-based pixel coordinates.
    lbls_coord = lbls_coord - 1
    gts.append(lbls_coord)
    imgpath = rows[1] + '/00001.png'
    img = cv2.imread(imgpath)
    heights.append(img.shape[0])
    widths.append(img.shape[1])
f.close()
# gts = gts[0: 200]
print('read gt')
# read prediction results
# Predictions are stored per video as ``<index>.dat`` (numpy-saved array,
# indexed as [xy, joint, frame] in feat_res grid coordinates).
for i in range(len(gts)):
    # import pdb; pdb.set_trace()
    predfile = src_folder + str(i) + '.dat'
    predres = np.load(predfile, allow_pickle=True)
    # import pdb; pdb.set_trace()
    # A negative x-coordinate marks a joint as invisible/unpredicted.
    jnt_visible = np.ones((predres.shape[1], predres.shape[2]))
    for j in range(predres.shape[1]):
        for k in range(predres.shape[2]):
            if predres[0, j, k] < 0:
                jnt_visible[j, k] = 0
    jnt_visible_set.append(jnt_visible)
    now_height = heights[i]
    now_width = widths[i]
    # Rescale grid coordinates back to image pixel coordinates.
    predres[0, :, :] = predres[0, :, :] / float(feat_res) * now_width
    predres[1, :, :] = predres[1, :, :] / float(feat_res) * now_height
    preds.append(predres)
print('read prediction')
# compute the human box for normalization
# PCK distances are normalized per frame by 0.6 * the diagonal of the tight
# bounding box around the visible ground-truth joints.
for i in range(len(gts)):
    nowgt = gts[i]
    jnt_visible = jnt_visible_set[i]
    now_boxes = np.zeros(nowgt.shape[2])
    for k in range(nowgt.shape[2]):
        # Track the bounding box of visible joints in frame k.
        minx = 1e6
        maxx = -1
        miny = 1e6
        maxy = -1
        for j in range(nowgt.shape[1]):
            if jnt_visible[j, k] == 0:
                continue
            minx = np.min([minx, nowgt[0, j, k]])
            miny = np.min([miny, nowgt[1, j, k]])
            maxx = np.max([maxx, nowgt[0, j, k]])
            maxy = np.max([maxy, nowgt[1, j, k]])
        now_boxes[k] = 0.6 * np.linalg.norm(np.subtract([maxx,maxy],[minx,miny]))
        # now_boxes[k] = np.max([maxy - miny, maxx - minx])
    human_boxes.append(now_boxes)
print('done box')
# compute distances
# distAll[j] collects, over all videos and frames, the normalized distance
# between prediction and ground truth for joint j (15 joints total).
# Invisible joints and frame 0 (the given reference frame) are skipped.
distAll = {}
for pidx in range(15):
    distAll[pidx] = np.zeros([0,0])
for i in range(len(gts)):
    predres = preds[i]
    nowgt = gts[i]
    now_boxes = human_boxes[i]
    jnt_visible = jnt_visible_set[i]
    for j in range(nowgt.shape[1]):
        for k in range(nowgt.shape[2]):
            if jnt_visible[j, k] == 0:
                continue
            if k == 0:
                continue
            predx = predres[0, j, k]
            predy = predres[1, j, k]
            gtx = nowgt[0, j, k]
            gty = nowgt[1, j, k]
            d = np.linalg.norm(np.subtract([predx, predy],[gtx, gty]))
            # Normalize by the frame's human-box scale computed above.
            dNorm = d / now_boxes[k]
            distAll[j] = np.append(distAll[j],[[dNorm]])
print('done distances')
def computePCK(distAll, distThresh):
    """Compute PCK (Percentage of Correct Keypoints) per joint.

    Args:
        distAll: dict mapping joint index -> array of normalized
            prediction-to-ground-truth distances for that joint.
        distThresh: threshold; a keypoint is "correct" when its normalized
            distance is <= distThresh.

    Returns:
        A (len(distAll)+1, 1) array of PCK percentages per joint, with the
        last row holding the unweighted mean over joints.
    """
    pckAll = np.zeros([len(distAll) + 1, 1])
    nCorrect = 0
    nTotal = 0
    for pidx in range(len(distAll)):
        dists = np.asarray(distAll[pidx]).ravel()
        nHit = int(np.count_nonzero(dists <= distThresh))
        # Guard against joints with no recorded samples; the original code
        # raised ZeroDivisionError here.
        pckAll[pidx, 0] = 100.0 * nHit / len(dists) if len(dists) > 0 else 0.0
        nCorrect += nHit
        nTotal += len(dists)
    # Final row: unweighted mean over joints (sample-weighted alternative
    # would be 100.0 * nCorrect / nTotal, kept for reference).
    pckAll[len(distAll), 0] = np.mean(pckAll[0:len(distAll), 0])
    return pckAll
# Report PCK at several thresholds; the printed value is the final row of
# computePCK's output, i.e. the mean over joints.
rng = [0.1, 0.2, 0.3, 0.4, 0.5]
for i in range(len(rng)):
    pckall = computePCK(distAll, rng[i])
    print(str(rng[i]) + ': ' + str(pckall[-1]) )
    # print(pckall[-1])
================================================
FILE: eval/mots/Evaluator.py
================================================
import pdb
import sys, os
sys.path.append(os.getcwd())
import argparse
import traceback
import time
import pickle
import pandas as pd
import glob
from os import path
import numpy as np
class Evaluator(object):
    """Runs evaluation per sequence and computes the overall performance on the benchmark.

    Subclasses implement `eval()`, which must set `self.results` (a list of
    per-sequence Metrics objects) and `self.Overall_Results` (an accumulator
    Metrics object) -- NOTE(review): these attributes are only used here, never
    assigned; confirm the subclass contract.
    """

    def __init__(self):
        pass

    def run(self, benchmark_name = None, gt_dir = None, res_dir = None, save_pkl = None, eval_mode = "train", seqmaps_dir = "seqmaps"):
        """Run the full benchmark evaluation.

        Params
        -----
        benchmark_name: Name of benchmark, e.g. MOT17
        gt_dir: directory of folders with gt data, including the c-files with sequences
        res_dir: directory with one <sequence>.txt result file per sequence
        save_pkl: path to output directory for final results (None = don't save)
        eval_mode: one of "train", "test", "all"
        seqmaps_dir: directory with the sequence-map files

        Returns
        -------
        (Overall_Results, results) tuple of accumulated and per-sequence metrics.
        """
        start_time = time.time()
        self.benchmark_gt_dir = gt_dir
        self.seq_file = "{}-{}.txt".format(benchmark_name, eval_mode)
        self.benchmark_name = benchmark_name
        self.seqmaps_dir = seqmaps_dir
        self.mode = eval_mode
        self.datadir = os.path.join(gt_dir, self.mode)
        error_traceback = ""
        # BUG FIX: original message used `%s` with the undefined name `s`,
        # which raised a NameError whenever the assert fired.
        assert self.mode in ["train", "test", "all"], "mode: %s not valid " % self.mode
        print("Evaluating Benchmark: %s" % self.benchmark_name)

        # ======================================================
        # Handle evaluation
        # ======================================================
        # load list of all sequences and collect gt/result file paths
        self.sequences = os.listdir(self.datadir)
        self.gtfiles = []
        self.tsfiles = []
        for seq in self.sequences:
            gtf = os.path.join(self.benchmark_gt_dir, self.mode, seq, 'gt/gt.txt')
            if path.exists(gtf):
                self.gtfiles.append(gtf)
            else:
                raise Exception("Ground Truth %s missing" % gtf)
            tsf = os.path.join(res_dir, "%s.txt" % seq)
            # BUG FIX: original tested `path.exists(gtf)` here, so a missing
            # tracker result file was never detected.
            if path.exists(tsf):
                self.tsfiles.append(tsf)
            else:
                raise Exception("Result file %s missing" % tsf)
        print('Found {} ground truth files and {} test files.'.format(len(self.gtfiles), len(self.tsfiles)))
        print(self.tsfiles)

        self.MULTIPROCESSING = False
        MAX_NR_CORES = 10
        # set number of cores for multiprocessing
        if self.MULTIPROCESSING:
            self.NR_CORES = np.minimum(MAX_NR_CORES, len(self.tsfiles))

        try:
            # run evaluation (implemented by the subclass)
            results = self.eval()
            # accumulate evaluation values over all sequences
            results_attributes = self.Overall_Results.metrics.keys()
            for attr in results_attributes:
                try:
                    self.Overall_Results.__dict__[attr] = sum(obj.__dict__[attr] for obj in self.results)
                except Exception:
                    # non-summable metrics (strings, derived ratios) are
                    # recomputed by compute_clearmot() below
                    pass
            cache_attributes = self.Overall_Results.cache_dict.keys()
            for attr in cache_attributes:
                # accumulate cache values over all sequences with each cache's own reducer
                try:
                    self.Overall_Results.__dict__[attr] = self.Overall_Results.cache_dict[attr]['func']([obj.__dict__[attr] for obj in self.results])
                except Exception:
                    pass
            print("evaluation successful")

            # Compute clearmot metrics for overall and all sequences
            for res in self.results:
                res.compute_clearmot()
            self.Overall_Results.compute_clearmot()
            self.accumulate_df(type = "mail")
            self.failed = False
            error = None
        except Exception:
            print(str(traceback.format_exc()))
            print(" Evaluation failed! ")
            error_traceback += str(traceback.format_exc())
            self.failed = True
            self.summary = None

        end_time = time.time()
        self.duration = (end_time - start_time) / 60.

        # ======================================================
        # Collect evaluation errors
        # ======================================================
        if self.failed:
            # Exception summaries raised by metric code are wrapped in
            # <exc>...<!exc> markers; extract just those summaries.
            # NOTE(review): markers restored from the upstream
            # MOTChallengeEvalKit -- the extracted copy had empty separators,
            # which would raise ValueError; confirm against the metric code.
            startExc = error_traceback.split("<exc>")
            error_traceback = [m.split("<!exc>")[0] for m in startExc[1:]]
            error = ""
            for err in error_traceback:
                error += "Error: %s" % err
            print("Error Message", error)
            self.error = error
            print("ERROR %s" % error)

        print("Evaluation Finished")
        print("Your Results")
        # NOTE(review): if evaluation failed, self.summary is None and this
        # raises -- preserved from the original control flow.
        print(self.render_summary())
        # save results if path set
        if save_pkl:
            self.Overall_Results.save_dict(os.path.join(save_pkl, "%s-%s-overall.pkl" % (self.benchmark_name, self.mode)))
            for res in self.results:
                res.save_dict(os.path.join(save_pkl, "%s-%s-%s.pkl" % (self.benchmark_name, self.mode, res.seqName)))
            print("Successfully save results")
        return self.Overall_Results, self.results

    def eval(self):
        """Per-benchmark evaluation hook; must be provided by subclasses."""
        raise NotImplementedError

    def accumulate_df(self, type = None):
        """Create accumulated dataframe with all sequences plus the overall row."""
        summary = None
        for k, res in enumerate(self.results):
            res.to_dataframe(display_name = True, type = type)
            if k == 0:
                summary = res.df
            else:
                # pd.concat replaces DataFrame.append (removed in pandas 2.0)
                summary = pd.concat([summary, res.df])
        summary = summary.sort_index()
        self.Overall_Results.to_dataframe(display_name = True, type = type)
        self.summary = pd.concat([summary, self.Overall_Results.df])

    def render_summary(self, buf = None):
        """Render metrics summary to console friendly tabular output.

        Params
        ------
        buf : StringIO-like, optional
            Buffer to write to

        Returns
        -------
        string
            Formatted string
        """
        output = self.summary.to_string(
            buf=buf,
            formatters=self.Overall_Results.formatters,
            justify="left"
        )
        return output
def run_metrics(metricObject, args):
    """Run the metric computation for one sequence and hand the object back.

    Params
    -----
    metricObject: object exposing `compute_metrics_per_sequence(**kwargs)`
    args: dict of keyword arguments forwarded to that method
    """
    metricObject.compute_metrics_per_sequence(**args)
    return metricObject
# Script entry point: instantiating the base class only sanity-checks the
# module; concrete benchmarks subclass `Evaluator` and call `run()`.
if __name__ == "__main__":
    Evaluator()
================================================
FILE: eval/mots/LICENSE
================================================
MIT License
Copyright (c) 2019 Visual Computing Institute
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: eval/mots/MOTSVisualization.py
================================================
import sys
from Visualize import Visualizer
from mots_common.io import load_sequences, load_seqmap, load_txt
import pycocotools.mask as rletools
import glob
import os
import cv2
import colorsys
import numpy as np
def apply_mask(image, mask, color, alpha=0.5):
    """
    Apply the given mask to the image.

    Pixels where mask == 1 are alpha-blended toward `color`, channel by
    channel; all other pixels are left untouched. Mutates `image` in place
    and also returns it.
    """
    for channel in range(3):
        plane = image[:, :, channel]
        blended = plane * (1 - alpha) + alpha * color[channel]
        image[:, :, channel] = np.where(mask == 1, blended, plane)
    return image
class MOTSVisualizer(Visualizer):
    """Visualizer that overlays MOTS RLE masks (and labels) onto frames."""

    def load(self, FilePath):
        """Parse a MOTS gt/result txt file into per-frame object lists."""
        return load_txt(FilePath)

    def drawResults(self, im=None, t=0):
        """Draw all objects of frame `t` onto image `im` and return it."""
        self.draw_boxes = False
        class_names = {1: "Car", 2: "Ped"}
        for obj in self.resFile[t]:
            color = self.colors[obj.track_id % len(self.colors)]
            color = tuple(int(c * 255) for c in color)
            category_name = class_names.get(obj.class_id)
            if category_name is None:
                # Don't show boxes or ids for ignore regions; render them gray.
                category_name = "Ignore"
                color = (0.7 * 255, 0.7 * 255, 0.7 * 255)
            else:
                x, y, w, h = rletools.toBbox(obj.mask)
                pt1 = (int(x), int(y))
                pt2 = (int(x + w), int(y + h))
                category_name += ":" + str(obj.track_id)
                cv2.putText(im, category_name, (int(x + 0.5 * w), int(y + 0.5 * h)),
                            cv2.FONT_HERSHEY_TRIPLEX, self.imScale, color, thickness=2)
                if self.draw_boxes:
                    cv2.rectangle(im, pt1, pt2, color, 2)
            binary_mask = rletools.decode(obj.mask)
            im = apply_mask(im, binary_mask, color)
        return im
# Example usage: render the ground-truth annotations of one MOTS sequence
# into a video under ./vid. Paths are relative to the repository root.
if __name__ == "__main__":
    visualizer = MOTSVisualizer(
        seqName = "MOTS20-11",
        FilePath ="data/MOTS/train/MOTS20-11/gt/gt.txt",
        image_dir = "data/MOTS/train/MOTS20-11/img1",
        mode = "gt",
        output_dir = "vid")
    visualizer.generateVideo(
        displayTime = True,
        displayName = "seg",
        showOccluder = True,
        fps = 25 )
================================================
FILE: eval/mots/MOTS_metrics.py
================================================
import math
from collections import defaultdict
import pycocotools.mask as rletools
from mots_common.io import SegmentedObject
from mots_common.io import load_seqmap, load_sequences, load_txt
from Metrics import Metrics
import os, sys
import numpy as np
from scipy.optimize import linear_sum_assignment as linear_assignment
# we only consider pedestrians
IGNORE_CLASS = 10  # class id marking "ignore" regions in MOTS annotations
CLASS_ID = 2       # pedestrian class id (evaluated class)
def mask_iou(a, b, criterion="union"):
    """IoU between the RLE masks of two SegmentedObjects.

    With criterion "union" this is standard IoU; any other value uses
    pycocotools' crowd semantics for `b`.
    """
    crowd_flag = criterion != "union"
    return rletools.iou([a.mask], [b.mask], [crowd_flag])[0][0]
class MOTSMetrics(Metrics):
    """CLEAR-MOTS + IDF1 metrics for a single sequence (or the accumulated
    totals over all sequences when used as the overall-results object)."""

    def __init__(self, seqName = None):
        super().__init__()
        if seqName:
            self.seqName = seqName
        else:
            self.seqName = 0
        # Evaluation metrics
        self.register(name = "sMOTSA", formatter='{:.2f}'.format)
        self.register(name = "MOTSA", formatter='{:.2f}'.format)
        self.register(name = "MOTSP", formatter='{:.2f}'.format)
        self.register(name = "MOTSAL", formatter='{:.2f}'.format, write_mail = False)
        self.register(name = "MODSA", formatter='{:.2f}'.format, write_mail = False)
        self.register(name = "MODSP", formatter='{:.2f}'.format, write_mail = False)
        self.register(name = "IDF1", formatter='{:.2f}'.format)
        self.register(name = "IDTP", formatter='{:.2f}'.format, write_mail = False)
        self.register(name = "MT", formatter='{:.0f}'.format)
        self.register(name = "PT", formatter='{:.0f}'.format, write_mail = False)
        self.register(name = "ML", formatter='{:.0f}'.format)
        self.register(name = "MTR", formatter='{:.2f}'.format)
        self.register(name = "PTR", formatter='{:.2f}'.format)
        self.register(name = "MLR", formatter='{:.2f}'.format)
        self.register(name = "n_gt_trajectories", display_name = "GT", formatter='{:.0f}'.format, write_mail = True)
        self.register(name = "tp", display_name="TP", formatter='{:.0f}'.format)  # number of true positives
        self.register(name = "fp", display_name="FP", formatter='{:.0f}'.format)  # number of false positives
        self.register(name = "fn", display_name="FN", formatter='{:.0f}'.format)  # number of false negatives
        self.register(name = "recall", display_name="Rcll", formatter='{:.2f}'.format)
        self.register(name = "precision", display_name="Prcn", formatter='{:.2f}'.format)
        self.register(name = "F1", display_name="F1", formatter='{:.2f}'.format, write_mail = False)
        self.register(name = "FAR", formatter='{:.2f}'.format, write_mail = False)
        self.register(name = "total_cost", display_name="COST", formatter='{:.0f}'.format, write_mail = False)
        self.register(name = "fragments", display_name="FM", formatter='{:.0f}'.format)
        self.register(name = "fragments_rel", display_name="FMR", formatter='{:.2f}'.format)
        self.register(name = "id_switches", display_name="IDSW", formatter='{:.0f}'.format)
        self.register(name = "id_switches_rel", display_name="IDSWR", formatter='{:.1f}'.format)
        self.register(name = "n_tr_trajectories", display_name = "TR", formatter='{:.0f}'.format, write_mail = False)
        self.register(name = "total_num_frames", display_name="TOTAL_NUM", formatter='{:.0f}'.format, write_mail = False)
        self.register(name = "n_gt", display_name = "GT_OBJ", formatter='{:.0f}'.format, write_mail = False)  # number of ground truth detections
        self.register(name = "n_tr", display_name = "TR_OBJ", formatter='{:.0f}'.format, write_mail = False)  # number of tracker detections minus ignored tracker detections
        self.register(name = "n_itr", display_name="IGNORED", formatter='{:.0f}'.format, write_mail = False)  # number of ignored tracker detections
        self.register(name = "id_n_tr", display_name = "ID_TR_OBJ", formatter='{:.0f}'.format, write_mail = False)
        self.register(name = "nbox_gt", display_name = "NBOX_GT", formatter='{:.0f}'.format, write_mail = False)

    def compute_clearmot(self):
        """Derive the ratio metrics (recall, precision, MOTSA family, MT/PT/ML
        ratios, IDF1, ...) from the raw counts accumulated by
        `compute_metrics_per_sequence` (or summed over sequences)."""
        # precision/recall etc.
        if (self.fp + self.tp) == 0 or (self.tp + self.fn) == 0:
            self.recall = 0.
            self.precision = 0.
        else:
            self.recall = self.tp / float(self.tp + self.fn) * 100.
            self.precision = self.tp / float(self.fp + self.tp) * 100.
        if (self.recall + self.precision) == 0:
            self.F1 = 0.
        else:
            self.F1 = (2. * (self.precision * self.recall) / (self.precision + self.recall)) * 100.
        if self.total_num_frames == 0:
            self.FAR = "n/a"
        else:
            self.FAR = (self.fp / float(self.total_num_frames))
        # compute CLEARMOT
        if self.n_gt == 0:
            self.MOTSA = -float("inf")
            self.MODSA = -float("inf")
            self.sMOTSA = -float("inf")
        else:
            self.MOTSA = (1 - (self.fn + self.fp + self.id_switches) / float(self.n_gt)) * 100.
            self.MODSA = (1 - (self.fn + self.fp) / float(self.n_gt)) * 100.
            self.sMOTSA = ((self.total_cost - self.fp - self.id_switches) / float(self.n_gt)) * 100.
        if self.tp == 0:
            self.MOTSP = float("inf")
        else:
            self.MOTSP = self.total_cost / float(self.tp) * 100.
        if self.n_gt != 0:
            # MOTSA variant with a log-scaled id-switch penalty
            if self.id_switches == 0:
                self.MOTSAL = (1 - (self.fn + self.fp + self.id_switches) / float(self.n_gt)) * 100.
            else:
                self.MOTSAL = (1 - (self.fn + self.fp + math.log10(self.id_switches)) / float(
                    self.n_gt)) * 100.
        else:
            self.MOTSAL = -float("inf")
        if self.total_num_frames == 0:
            self.MODSP = "n/a"
        else:
            self.MODSP = self.MODSP / float(self.total_num_frames) * 100.
        if self.n_gt_trajectories == 0:
            self.MTR = 0.
            self.PTR = 0.
            self.MLR = 0.
        else:
            self.MTR = self.MT * 100. / float(self.n_gt_trajectories)
            self.PTR = self.PT * 100. / float(self.n_gt_trajectories)
            self.MLR = self.ML * 100. / float(self.n_gt_trajectories)
        # calculate relative IDSW and FM
        if self.recall != 0:
            self.id_switches_rel = self.id_switches / self.recall * 100
            self.fragments_rel = self.fragments / self.recall * 100
        else:
            self.id_switches_rel = float("inf")
            self.fragments_rel = float("inf")
        # IDF1
        if self.n_gt_trajectories == 0:
            self.IDF1 = 0.
        else:
            self.IDF1 = (2 * self.IDTP) / (self.nbox_gt + self.id_n_tr) * 100.
        return self

    # go through all frames and associate ground truth and tracker results
    def compute_metrics_per_sequence(self, sequence, pred_file, gt_file, gtDataDir, benchmark_name,
                                     ignore_class = IGNORE_CLASS, class_id = CLASS_ID, overlap_function = mask_iou):
        """Accumulate raw CLEAR-MOTS counts (tp/fp/fn, switches, fragments,
        MT/PT/ML, IDF1 inputs) for one sequence.

        Params
        ------
        sequence: sequence name (used for error reporting)
        pred_file / gt_file: txt files with tracker / ground-truth annotations
        gtDataDir: directory containing the sequence's seqinfo.ini
        benchmark_name: benchmark identifier (currently unused here)
        ignore_class / class_id: class ids for ignore regions / evaluated class
        overlap_function: mask-overlap function; must support a crowd criterion
        """
        gt_seq = load_txt(gt_file)
        results_seq = load_txt(pred_file)
        # load information about sequence
        import configparser
        config = configparser.ConfigParser()
        config.read(os.path.join(gtDataDir, "seqinfo.ini"))
        max_frames = int(config['Sequence']["seqlength"])
        self.total_num_frames = max_frames + 1
        seq_trajectories = defaultdict(list)
        # To count number of track ids
        gt_track_ids = set()
        tr_track_ids = set()
        # Statistics over the current sequence
        seqtp = 0
        seqfn = 0
        seqfp = 0
        seqitr = 0
        n_gts = 0
        n_trs = 0
        frame_to_ignore_region = {}
        # Iterate over frames in this sequence
        for f in range(max_frames + 1):
            g = []
            dc = []
            t = []
            if f in gt_seq:
                for obj in gt_seq[f]:
                    if obj.class_id == ignore_class:
                        dc.append(obj)
                    elif obj.class_id == class_id:
                        g.append(obj)
                        gt_track_ids.add(obj.track_id)
            if f in results_seq:
                for obj in results_seq[f]:
                    if obj.class_id == class_id:
                        t.append(obj)
                        tr_track_ids.add(obj.track_id)
            # Handle ignore regions as one large ignore region
            dc = SegmentedObject(mask=rletools.merge([d.mask for d in dc], intersect=False),
                                 class_id=ignore_class, track_id=ignore_class)
            frame_to_ignore_region[f] = dc

            tracks_valid = [False for _ in range(len(t))]
            # counting total number of ground truth and tracker objects
            self.n_gt += len(g)
            self.n_tr += len(t)
            n_gts += len(g)
            n_trs += len(t)

            # tmp variables for sanity checks and MODSP computation
            tmptp = 0
            tmpfp = 0
            tmpfn = 0
            tmpc = 0  # this will sum up the overlaps for all true positives
            tmpcs = [0] * len(g)  # this will save the overlaps for all true positives
            # the reason is that some true positives might be ignored
            # later such that the corresponding overlaps can
            # be subtracted from tmpc for MODSP computation

            # To associate, simply take for each ground truth the (unique!)
            # detection with IoU>0.5 if it exists (MOTS masks are disjoint, so
            # at most one detection can exceed 0.5 IoU with a gt mask).
            # all ground truth trajectories are initially not associated
            # extend groundtruth trajectories lists (merge lists)
            for gg in g:
                seq_trajectories[gg.track_id].append(-1)
            num_associations = 0
            for row, gg in enumerate(g):
                for col, tt in enumerate(t):
                    c = overlap_function(gg, tt)
                    if c > 0.5:
                        tracks_valid[col] = True
                        self.total_cost += c
                        tmpc += c
                        tmpcs[row] = c
                        seq_trajectories[g[row].track_id][-1] = t[col].track_id
                        # true positives are only valid associations
                        self.tp += 1
                        tmptp += 1
                        num_associations += 1

            # associate tracker and DontCare areas
            # ignore tracker in neighboring classes
            nignoredtracker = 0  # number of ignored tracker detections
            for i, tt in enumerate(t):
                # crowd criterion: intersection over detection area
                overlap = overlap_function(tt, dc, "a")
                if overlap > 0.5 and not tracks_valid[i]:
                    nignoredtracker += 1
            # count the number of ignored tracker objects
            self.n_itr += nignoredtracker

            # false negatives = non-associated gt instances
            tmpfn += len(g) - num_associations
            self.fn += len(g) - num_associations
            # false positives = tracker instances - associated tracker instances
            tmpfp += len(t) - tmptp - nignoredtracker
            self.fp += len(t) - tmptp - nignoredtracker

            # update sequence data
            seqtp += tmptp
            seqfp += tmpfp
            seqfn += tmpfn
            seqitr += nignoredtracker

            # sanity checks
            # - the number of true positives minus ignored true positives
            #   should be greater or equal to 0
            # - the number of false negatives should be greater or equal to 0
            # - the number of false positives needs to be greater or equal to 0
            #   otherwise ignored detections might be counted double
            # - TP + FN should match the total number of ground truth objects
            # - TP + FP + ignored should match the total number of tracker detections
            if tmptp < 0:
                print(tmptp)
                raise NameError("Something went wrong! TP is negative")
            if tmpfn < 0:
                print(tmpfn, len(g), num_associations)
                raise NameError("Something went wrong! FN is negative")
            if tmpfp < 0:
                print(tmpfp, len(t), tmptp, nignoredtracker)
                raise NameError("Something went wrong! FP is negative")
            if tmptp + tmpfn != len(g):
                # BUG FIX: original printed the undefined name `seq_name`,
                # raising a NameError that masked the real sanity-check error.
                print("seqname", sequence)
                print("frame ", f)
                print("TP ", tmptp)
                print("FN ", tmpfn)
                print("FP ", tmpfp)
                print("nGT ", len(g))
                print("nAss ", num_associations)
                raise NameError("Something went wrong! nGroundtruth is not TP+FN")
            if tmptp + tmpfp + nignoredtracker != len(t):
                # BUG FIX: same undefined `seq_name` as above.
                print(sequence, f, len(t), tmptp, tmpfp)
                print(num_associations)
                raise NameError("Something went wrong! nTracker is not TP+FP")

            # compute MODSP (per-frame mean overlap of true positives)
            MODSP_f = 1
            if tmptp != 0:
                MODSP_f = tmpc / float(tmptp)
            self.MODSP += MODSP_f

        assert len(seq_trajectories) == len(gt_track_ids)
        self.n_gt_trajectories = len(gt_track_ids)
        self.n_tr_trajectories = len(tr_track_ids)

        # compute MT/PT/ML, fragments, idswitches for all groundtruth trajectories
        if len(seq_trajectories) != 0:
            for g in seq_trajectories.values():
                # all frames of this gt trajectory are not assigned to any detections
                if all([this == -1 for this in g]):
                    self.ML += 1
                    continue
                # compute tracked frames in trajectory
                last_id = g[0]
                # first detection (necessary to be in gt_trajectories) is always tracked
                tracked = 1 if g[0] >= 0 else 0
                for f in range(1, len(g)):
                    if last_id != g[f] and last_id != -1 and g[f] != -1:
                        self.id_switches += 1
                    if f < len(g) - 1 and g[f - 1] != g[f] and last_id != -1 and g[f] != -1 and g[f + 1] != -1:
                        self.fragments += 1
                    if g[f] != -1:
                        tracked += 1
                        last_id = g[f]
                # handle last frame; tracked state is handled in for loop (g[f]!=-1)
                if len(g) > 1 and g[f - 1] != g[f] and last_id != -1 and g[f] != -1:
                    self.fragments += 1
                # compute MT/PT/ML
                tracking_ratio = tracked / float(len(g))
                if tracking_ratio > 0.8:
                    self.MT += 1
                elif tracking_ratio < 0.2:
                    self.ML += 1
                else:  # 0.2 <= tracking_ratio <= 0.8
                    self.PT += 1

        # compute IDF1
        idf1, idtp, nbox_gt, id_n_tr = compute_idf1_and_idtp_for_sequence(gt_seq, results_seq, gt_track_ids, tr_track_ids, frame_to_ignore_region)
        self.IDTP = idtp
        self.id_n_tr = id_n_tr
        self.nbox_gt = nbox_gt
        return self
### IDF1 stuff
### code below adapted from https://github.com/shenh10/mot_evaluation/blob/5dd51e5cb7b45992774ea150e4386aa0b02b586f/utils/measurements.py
def compute_idf1_and_idtp_for_sequence(frame_to_gt, frame_to_pred, gt_ids, st_ids, frame_to_ignore_region):
    """Compute IDF1 and its components for one sequence.

    frame_to_gt / frame_to_pred: dict frame -> list of SegmentedObjects.
    gt_ids / st_ids: sets of ground-truth / tracker track ids.
    frame_to_ignore_region: dict frame -> merged ignore-region SegmentedObject.

    Returns (IDF1, IDTP, nbox_gt, id_n_tr).
    """
    # Mark predictions that fall mostly inside the frame's ignore region
    # (crowd IoU > 0.5); these are excluded from the FP counts below.
    frame_to_can_be_ignored = {}
    for t in frame_to_pred.keys():
        preds_t = frame_to_pred[t]
        pred_masks_t = [p.mask for p in preds_t]
        ignore_region_t = frame_to_ignore_region[t].mask
        overlap = np.squeeze(rletools.iou(pred_masks_t, [ignore_region_t], [1]), axis=1)
        frame_to_can_be_ignored[t] = overlap > 0.5
    # Stable ordering so row/column indices map deterministically to track ids.
    gt_ids = sorted(gt_ids)
    st_ids = sorted(st_ids)
    # Group detections per trajectory:
    # groundtruth[i] -> list of (frame, gt_obj)
    # prediction[j]  -> list of (frame, pred_obj, can_be_ignored)
    groundtruth = [[] for _ in gt_ids]
    prediction = [[] for _ in st_ids]
    for t, gts_t in frame_to_gt.items():
        for gt_t in gts_t:
            if gt_t.track_id in gt_ids:
                groundtruth[gt_ids.index(gt_t.track_id)].append((t, gt_t))
    for t in frame_to_pred.keys():
        preds_t = frame_to_pred[t]
        can_be_ignored_t = frame_to_can_be_ignored[t]
        assert len(preds_t) == len(can_be_ignored_t)
        for pred_t, ign_t in zip(preds_t, can_be_ignored_t):
            if pred_t.track_id in st_ids:
                prediction[st_ids.index(pred_t.track_id)].append((t, pred_t, ign_t))
    for gt in groundtruth:
        gt.sort(key=lambda x: x[0])
    for pred in prediction:
        pred.sort(key=lambda x: x[0])
    # Build the (n_gt+n_st) x (n_st+n_gt) assignment cost matrix:
    # top-left block = pairwise trajectory costs, the off-diagonal blocks are
    # "unmatched" slots (a huge cost forbids matching a dummy to a dummy).
    n_gt = len(gt_ids)
    n_st = len(st_ids)
    cost = np.zeros((n_gt + n_st, n_st + n_gt), dtype=float)
    cost[n_gt:, :n_st] = sys.maxsize  # float('inf')
    cost[:n_gt, n_st:] = sys.maxsize  # float('inf')
    fp = np.zeros(cost.shape)
    fn = np.zeros(cost.shape)
    ign = np.zeros(cost.shape)
    # cost matrix of all trajectory pairs
    cost_block, fp_block, fn_block, ign_block = cost_between_gt_pred(groundtruth, prediction)
    cost[:n_gt, :n_st] = cost_block
    fp[:n_gt, :n_st] = fp_block
    fn[:n_gt, :n_st] = fn_block
    ign[:n_gt, :n_st] = ign_block
    # computed trajectory match no groundtruth trajectory, FP
    for i in range(n_st):
        # don't count fp in case of ignore region
        fps = sum([~x[2] for x in prediction[i]])
        ig = sum([x[2] for x in prediction[i]])
        cost[i + n_gt, i] = fps
        fp[i + n_gt, i] = fps
        ign[i + n_gt, i] = ig
    # groundtruth trajectory match no computed trajectory, FN
    for i in range(n_gt):
        cost[i, i + n_st] = len(groundtruth[i])
        fn[i, i + n_st] = len(groundtruth[i])
    # scipy's linear_sum_assignment returns (row_indices, col_indices)
    matched_indices = linear_assignment(cost)
    nbox_gt = sum([len(groundtruth[i]) for i in range(n_gt)])
    nbox_st = sum([len(prediction[i]) for i in range(n_st)])
    IDFN = 0
    id_ign = 0
    for matched in zip(*matched_indices):
        IDFN += fn[matched[0], matched[1]]
        # exclude detections which are not matched and ignored from total count
        id_ign += ign[matched[0], matched[1]]
    id_n_tr = nbox_st - id_ign
    IDTP = nbox_gt - IDFN
    IDF1 = 2 * IDTP / (nbox_gt + id_n_tr)
    return IDF1, IDTP, nbox_gt, id_n_tr
def cost_between_gt_pred(groundtruth, prediction):
    """Pairwise association costs between all gt and predicted trajectories.

    Returns four (n_gt, n_st) arrays: total cost (fp+fn), fp, fn, and the
    count of ignorable unmatched predictions per pair.
    """
    n_gt = len(groundtruth)
    n_st = len(prediction)
    shape = (n_gt, n_st)
    cost = np.zeros(shape, dtype=float)
    fp = np.zeros(shape, dtype=float)
    fn = np.zeros(shape, dtype=float)
    ign = np.zeros(shape, dtype=float)
    for i, gt_traj in enumerate(groundtruth):
        for j, pred_traj in enumerate(prediction):
            fp[i, j], fn[i, j], ign[i, j] = cost_between_trajectories(gt_traj, pred_traj)
            cost[i, j] = fp[i, j] + fn[i, j]
    return cost, fp, fn, ign
def cost_between_trajectories(traj1, traj2):
    """Association cost between one gt trajectory and one predicted trajectory.

    traj1: list of (frame, gt SegmentedObject), sorted by frame.
    traj2: list of (frame, pred SegmentedObject, can_be_ignored), sorted by frame.

    Returns (fp, fn, ig): unmatched predictions (excluding ignorable ones),
    unmatched gt boxes, and ignorable unmatched predictions.
    """
    npoints1 = len(traj1)
    npoints2 = len(traj2)
    # find start and end frame of each trajectories
    times1 = [x[0] for x in traj1]
    times2 = [x[0] for x in traj2]
    start1 = min(times1)
    start2 = min(times2)
    end1 = max(times1)
    end2 = max(times2)
    # per-point "can be ignored" flags of the prediction
    ign = [traj2[i][2] for i in range(npoints2)]
    # check frame overlap
    # careful, changed this to <=, but I think now it's right
    has_overlap = max(start1, start2) <= min(end1, end2)
    if not has_overlap:
        # disjoint in time: every gt point is a miss, every non-ignorable
        # prediction point a false positive
        fn = npoints1
        # disregard detections which can be ignored
        fp = sum([~x for x in ign])
        ig = sum(ign)
        return fp, fn, ig
    # gt trajectory mapping to st, check gt missed
    matched_pos1 = corresponding_frame(times1, npoints1, times2, npoints2)
    # st trajectory mapping to gt, check computed one false alarms
    matched_pos2 = corresponding_frame(times2, npoints2, times1, npoints1)
    overlap1 = compute_overlap(traj1, traj2, matched_pos1)
    overlap2 = compute_overlap(traj2, traj1, matched_pos2)
    # FN: gt points without a >=0.5 mask overlap in the same frame
    fn = sum([1 for i in range(npoints1) if overlap1[i] < 0.5])
    # FP: prediction points without a >=0.5 overlap, unless ignorable
    unmatched = [overlap2[i] < 0.5 for i in range(npoints2)]
    fp = sum([1 for i in range(npoints2) if unmatched[i] and not ign[i]])
    ig = sum([1 for i in range(npoints2) if unmatched[i] and ign[i]])
    return fp, fn, ig
def corresponding_frame(traj1, len1, traj2, len2):
    """
    Find the matching position in traj2 regarding to traj1
    Assume both trajectories in ascending frame ID

    Returns an int array of length len1 where entry i is the index in traj2
    with the same frame id as traj1[i], or -1 when that frame is absent.
    """
    loc = np.full((len1,), -1, dtype=int)
    i = j = 0
    # two-pointer merge over both sorted frame-id lists
    while i < len1 and j < len2:
        if traj1[i] < traj2[j]:
            i += 1          # frame only in traj1 -> stays -1
        elif traj1[i] == traj2[j]:
            loc[i] = j
            i += 1
            j += 1
        else:
            j += 1          # frame only in traj2 -> skip
    return loc
def compute_overlap(traj1, traj2, matched_pos):
    """
    Compute the loss hit in traj2 regarding to traj1

    For each point of traj1, returns the mask IoU with the traj2 point of the
    same frame (per matched_pos), or 0.0 where no such frame exists.
    """
    n = len(matched_pos)
    overlap = np.zeros((n,), dtype=float)
    for i, j in enumerate(matched_pos):
        if j == -1:
            continue
        m1 = traj1[i][1].mask
        m2 = traj2[j][1].mask
        overlap[i] = rletools.iou([m1], [m2], [False])[0][0]
    return overlap
================================================
FILE: eval/mots/Metrics.py
================================================
from __future__ import division
from collections import OrderedDict, Iterable
import pandas as pd
import numpy as np
import pickle
class Metrics(object):
    """Registry-style base class for evaluation metrics.

    Metrics are declared via `register()` and stored both as instance
    attributes (for computation) and in an ordered registry (for formatting,
    dataframe export, and mail/db reporting).
    """

    def __init__(self):
        # Ordered registries so report columns keep registration order.
        self.metrics = OrderedDict()
        self.cache_dict = OrderedDict()

    def register(self, name=None, value=None, formatter=None,
                 display_name=None, write_db = True, write_mail = True):
        """Register a new metric.

        Params
        ------
        name: str
            Name of the metric. Name is used for computation and set as attribute.
        display_name: str or None
            Display name of variable written in db and mail
        value:
            Initial value (falsy values are replaced by 0, see NOTE below)
        formatter:
            Formatter to present value of metric. E.g. `'{:.2f}'.format`
        write_db: boolean, default = True
            Write value into db
        write_mail: boolean, default = True
            Write metric in result mail to user
        """
        # BUG FIX: original used `'...'.format(name)` on a string without a
        # placeholder (a no-op) and the `not name is None` anti-idiom.
        assert name is not None, 'No name specified'
        # NOTE(review): any falsy initial value (0.0, "", False) is coerced to
        # the integer 0 here -- preserved from the original behavior.
        if not value:
            value = 0
        self.__setattr__(name, value)
        if not display_name:
            display_name = name
        self.metrics[name] = {
            'name': name,
            'write_db': write_db,
            'formatter': formatter,
            'write_mail': write_mail,
            'display_name': display_name
        }

    def cache(self, name=None, value=None, func=None):
        """Register a cached value with an accumulation function `func`
        (used by the evaluator to merge per-sequence values)."""
        assert name is not None, 'No name specified'
        self.__setattr__(name, value)
        self.cache_dict[name] = {
            'name': name,
            'func': func
        }

    def __call__(self, name):
        """Return the registry entry (dict) of metric `name`."""
        return self.metrics[name]

    @property
    def names(self):
        """Returns the name identifiers of all registered metrics."""
        return [v['name'] for v in self.metrics.values()]

    @property
    def display_names(self):
        """Returns the display name identifiers of all registered metrics."""
        return [v['display_name'] for v in self.metrics.values()]

    @property
    def formatters(self):
        """Returns the formatters for all metrics that have associated formatters."""
        return dict([(v['display_name'], v['formatter']) for k, v in self.metrics.items() if not v['formatter'] is None])

    def val_dict(self, display_name = False, object = "metrics"):
        """Returns dictionary of all registered values of object name or display_name as key.

        Params
        ------
        display_name: boolean, default = False
            If True, display_name of keys in dict. (default names)
        object: "cache" or "metrics", default = "metrics"
            (parameter name shadows the builtin; kept for caller compatibility)
        """
        if display_name:
            key_string = "display_name"
        else:
            key_string = "name"
        print("object dict: ", object)
        val_dict = dict([(self.__getattribute__(object)[key][key_string], self.__getattribute__(key)) for key in self.__getattribute__(object).keys()])
        return val_dict

    def val_db(self, display_name = True):
        """Returns dictionary of all registered values metrics to write in db."""
        if display_name:
            key_string = "display_name"
        else:
            key_string = "name"
        val_dict = dict([(self.metrics[key][key_string], self.__getattribute__(key)) for key in self.metrics.keys() if self.metrics[key]["write_db"]])
        return val_dict

    def val_mail(self, display_name = True):
        """Returns dictionary of all registered values metrics to write in mail."""
        if display_name:
            key_string = "display_name"
        else:
            key_string = "name"
        val_dict = dict([(self.metrics[key][key_string], self.__getattribute__(key)) for key in self.metrics.keys() if self.metrics[key]["write_mail"]])
        return val_dict

    def to_dataframe(self, display_name = False, type = None):
        """Builds self.df, a one-row dataframe of the registered metric values.

        NOTE(review): indexes the row by self.seqName, which subclasses are
        expected to set -- confirm before calling on the base class.
        """
        if type == "mail":
            self.df = pd.DataFrame(self.val_mail(display_name = display_name), index=[self.seqName])
        else:
            self.df = pd.DataFrame(self.val_dict(display_name = display_name), index=[self.seqName])

    def update_values(self, value_dict = None):
        """Updates registered metrics with new values in value_dict.
        Keys without a matching attribute are silently skipped."""
        if value_dict:
            for key, value in value_dict.items():
                if hasattr(self, key):
                    self.__setattr__(key, value)

    def print_type(self, object = "metrics"):
        """Prints variable type of registered metrics or caches."""
        print("OBJECT ", object)
        val_dict = self.val_dict(object = object)
        for key, item in val_dict.items():
            print("%s: %s; Shape: %s" % (key, type(item), np.shape(item)))

    def print_results(self):
        """Prints metrics."""
        result_dict = self.val_dict()
        for key, item in result_dict.items():
            print(key)
            # BUG FIX: original called the formatter unconditionally, raising
            # TypeError for metrics registered with formatter=None.
            fmt = self.metrics[key]["formatter"]
            print("%s: %s" % (key, fmt(item) if fmt else item))

    def save_dict(self, path):
        """Save value dict to path as pickle file."""
        with open(path, 'wb') as handle:
            pickle.dump(self.__dict__, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def compute_metrics_per_sequence(self):
        """Per-sequence metric computation hook; provided by subclasses."""
        raise NotImplementedError
================================================
FILE: eval/mots/README.md
================================================
# MOTS

## Requirements
* Python 3.6.9
* install [requirements.txt](requirements.txt)
## Usage
1) Run
```
python MOTS/evalMOTS.py
```
## Evaluation
To run the evaluation for your method please adjust the file ```MOTS/evalMOTS.py``` using the following arguments:
```benchmark_name```: Name of the benchmark, e.g. MOTS
```gt_dir```: Directory containing ground truth files in ```/