Repository: Zhongdao/UniTrack Branch: main Commit: a83e782f5c56 Files: 379 Total size: 2.3 MB Directory structure: gitextract_942_umbq/ ├── .gitignore ├── LICENSE ├── README.md ├── config/ │ ├── crw_resnet18_s3.yaml │ ├── crw_resnet18_s3_womotion.yaml │ ├── imagenet_resnet18_s3.yaml │ └── imagenet_resnet18_s3_womotion.yaml ├── core/ │ ├── association/ │ │ ├── __init__.py │ │ └── matching.py │ ├── motion/ │ │ └── kalman_filter.py │ └── propagation/ │ ├── __init__.py │ ├── propagate_box.py │ ├── propagate_mask.py │ └── propagate_pose.py ├── data/ │ ├── jhmdb.py │ ├── kinetics.py │ ├── video.py │ └── vos.py ├── demo/ │ ├── mot_demo.py │ └── sot_demo.py ├── detector/ │ └── YOLOX/ │ ├── .gitignore │ ├── LICENSE │ ├── README.md │ ├── datasets/ │ │ └── README.md │ ├── demo/ │ │ ├── ONNXRuntime/ │ │ │ ├── README.md │ │ │ └── onnx_inference.py │ │ ├── OpenVINO/ │ │ │ ├── README.md │ │ │ ├── cpp/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ │ └── yolox_openvino.cpp │ │ │ └── python/ │ │ │ ├── README.md │ │ │ └── openvino_inference.py │ │ ├── TensorRT/ │ │ │ ├── cpp/ │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── README.md │ │ │ │ ├── logging.h │ │ │ │ └── yolox.cpp │ │ │ └── python/ │ │ │ └── README.md │ │ └── ncnn/ │ │ ├── android/ │ │ │ ├── README.md │ │ │ ├── app/ │ │ │ │ ├── build.gradle │ │ │ │ └── src/ │ │ │ │ └── main/ │ │ │ │ ├── AndroidManifest.xml │ │ │ │ ├── assets/ │ │ │ │ │ └── yolox.param │ │ │ │ ├── java/ │ │ │ │ │ └── com/ │ │ │ │ │ └── megvii/ │ │ │ │ │ └── yoloXncnn/ │ │ │ │ │ ├── MainActivity.java │ │ │ │ │ ├── YOLOXncnn.java │ │ │ │ │ └── yoloXncnn.java │ │ │ │ ├── jni/ │ │ │ │ │ ├── CMakeLists.txt │ │ │ │ │ └── yoloXncnn_jni.cpp │ │ │ │ └── res/ │ │ │ │ ├── layout/ │ │ │ │ │ └── main.xml │ │ │ │ └── values/ │ │ │ │ └── strings.xml │ │ │ ├── build.gradle │ │ │ ├── gradle/ │ │ │ │ └── wrapper/ │ │ │ │ ├── gradle-wrapper.jar │ │ │ │ └── gradle-wrapper.properties │ │ │ ├── gradlew │ │ │ ├── gradlew.bat │ │ │ └── settings.gradle │ │ └── cpp/ │ │ ├── 
README.md │ │ └── yolox.cpp │ ├── demo.py │ ├── docs/ │ │ └── train_custom_data.md │ ├── exps/ │ │ ├── default/ │ │ │ ├── nano.py │ │ │ ├── yolov3.py │ │ │ ├── yolox_l.py │ │ │ ├── yolox_m.py │ │ │ ├── yolox_s.py │ │ │ ├── yolox_tiny.py │ │ │ └── yolox_x.py │ │ └── example/ │ │ └── yolox_voc/ │ │ └── yolox_voc_s.py │ ├── requirements.txt │ ├── setup.cfg │ ├── setup.py │ ├── tools/ │ │ ├── __init__.py │ │ ├── demo.py │ │ ├── eval.py │ │ ├── export_onnx.py │ │ ├── train.py │ │ └── trt.py │ └── yolox/ │ ├── __init__.py │ ├── core/ │ │ ├── __init__.py │ │ ├── launch.py │ │ └── trainer.py │ ├── data/ │ │ ├── __init__.py │ │ ├── data_augment.py │ │ ├── data_prefetcher.py │ │ ├── dataloading.py │ │ ├── datasets/ │ │ │ ├── __init__.py │ │ │ ├── coco.py │ │ │ ├── coco_classes.py │ │ │ ├── datasets_wrapper.py │ │ │ ├── mosaicdetection.py │ │ │ ├── voc.py │ │ │ └── voc_classes.py │ │ └── samplers.py │ ├── evaluators/ │ │ ├── __init__.py │ │ ├── coco_evaluator.py │ │ ├── voc_eval.py │ │ └── voc_evaluator.py │ ├── exp/ │ │ ├── __init__.py │ │ ├── base_exp.py │ │ ├── build.py │ │ └── yolox_base.py │ ├── layers/ │ │ ├── __init__.py │ │ ├── csrc/ │ │ │ ├── cocoeval/ │ │ │ │ ├── cocoeval.cpp │ │ │ │ └── cocoeval.h │ │ │ └── vision.cpp │ │ └── fast_coco_eval_api.py │ ├── models/ │ │ ├── __init__.py │ │ ├── darknet.py │ │ ├── losses.py │ │ ├── network_blocks.py │ │ ├── yolo_fpn.py │ │ ├── yolo_head.py │ │ ├── yolo_pafpn.py │ │ └── yolox.py │ └── utils/ │ ├── __init__.py │ ├── allreduce_norm.py │ ├── boxes.py │ ├── checkpoint.py │ ├── demo_utils.py │ ├── dist.py │ ├── ema.py │ ├── logger.py │ ├── lr_scheduler.py │ ├── metric.py │ ├── model_utils.py │ ├── setup_env.py │ └── visualize.py ├── docs/ │ ├── DATA.md │ ├── INSTALL.md │ ├── MODELZOO.md │ ├── RESULTS.md │ └── RUN.md ├── eval/ │ ├── convert_davis.py │ ├── davis_dummy.txt │ ├── eval_mot.py │ ├── eval_pck.py │ ├── mots/ │ │ ├── Evaluator.py │ │ ├── LICENSE │ │ ├── MOTSVisualization.py │ │ ├── MOTS_metrics.py │ │ ├── Metrics.py │ │ 
├── README.md │ │ ├── Visualize.py │ │ ├── __init__.py │ │ ├── evalMOTS.py │ │ ├── mots_common/ │ │ │ ├── images_to_txt.py │ │ │ └── io.py │ │ └── requirements.txt │ ├── palette.py │ ├── poseval/ │ │ ├── .gitignore │ │ ├── .gitmodules │ │ ├── .pylintrc │ │ ├── README.md │ │ ├── evaluate.py │ │ ├── license.txt │ │ ├── matlab/ │ │ │ ├── external/ │ │ │ │ └── jsonlab/ │ │ │ │ ├── AUTHORS.txt │ │ │ │ ├── ChangeLog.txt │ │ │ │ ├── LICENSE_BSD.txt │ │ │ │ ├── README.txt │ │ │ │ ├── examples/ │ │ │ │ │ ├── demo_jsonlab_basic.m │ │ │ │ │ ├── demo_ubjson_basic.m │ │ │ │ │ ├── example1.json │ │ │ │ │ ├── example2.json │ │ │ │ │ ├── example3.json │ │ │ │ │ ├── example4.json │ │ │ │ │ ├── jsonlab_basictest.matlab │ │ │ │ │ ├── jsonlab_selftest.m │ │ │ │ │ ├── jsonlab_selftest.matlab │ │ │ │ │ └── jsonlab_speedtest.m │ │ │ │ ├── jsonopt.m │ │ │ │ ├── loadjson.m │ │ │ │ ├── loadubjson.m │ │ │ │ ├── mergestruct.m │ │ │ │ ├── savejson.m │ │ │ │ ├── saveubjson.m │ │ │ │ ├── struct2jdata.m │ │ │ │ └── varargin2struct.m │ │ │ ├── mat2json.m │ │ │ └── startup.m │ │ └── poseval/ │ │ ├── __init__.py │ │ ├── convert.py │ │ ├── eval_helpers.py │ │ ├── evaluateAP.py │ │ ├── evaluatePCKh.py │ │ ├── evaluateTracking.py │ │ └── posetrack18_id2fname.py │ └── trackeval/ │ ├── __init__.py │ ├── _timing.py │ ├── datasets/ │ │ ├── __init__.py │ │ ├── _base_dataset.py │ │ ├── bdd100k.py │ │ ├── davis.py │ │ ├── kitti_2d_box.py │ │ ├── kitti_mots.py │ │ ├── mot_challenge_2d_box.py │ │ ├── mots_challenge.py │ │ ├── tao.py │ │ └── youtube_vis.py │ ├── eval.py │ ├── metrics/ │ │ ├── __init__.py │ │ ├── _base_metric.py │ │ ├── clear.py │ │ ├── count.py │ │ ├── hota.py │ │ ├── identity.py │ │ ├── j_and_f.py │ │ ├── track_map.py │ │ └── vace.py │ ├── plotting.py │ └── utils.py ├── eval.sh ├── model/ │ ├── __init__.py │ ├── functional.py │ ├── hrnet.py │ ├── model.py │ ├── random_feat_generator.py │ └── resnet.py ├── requirements.txt ├── setup.py ├── test/ │ ├── test_mot.py │ ├── test_mots.py │ ├── 
test_poseprop.py │ ├── test_posetrack.py │ ├── test_sot_cfnet.py │ ├── test_sot_siamfc.py │ ├── test_vis.py │ └── test_vos.py ├── tools/ │ ├── gen_mot16_fairmot.py │ ├── gen_mot16_gt.py │ ├── gen_mot16_label17.py │ ├── gen_mot19_det.py │ ├── gen_mots_costa.py │ └── gen_mots_gt.py ├── tracker/ │ ├── mot/ │ │ ├── basetrack.py │ │ ├── box.py │ │ ├── mask.py │ │ ├── multitracker.py │ │ └── pose.py │ └── sot/ │ └── lib/ │ ├── core/ │ │ ├── config.py │ │ ├── config_ocean.py │ │ ├── config_oceanplus.py │ │ ├── config_siamdw.py │ │ ├── eval_davis.py │ │ ├── eval_got10k.py │ │ ├── eval_lasot.py │ │ ├── eval_otb.py │ │ ├── eval_visdrone.py │ │ ├── extract_tune_logs.py │ │ └── function.py │ ├── dataset/ │ │ ├── crop/ │ │ │ ├── DAVIS/ │ │ │ │ ├── gen_json.py │ │ │ │ ├── par_crop.py │ │ │ │ └── readme.md │ │ │ ├── RGBT210/ │ │ │ │ ├── RGBT210_genjson.py │ │ │ │ ├── gen_json.py │ │ │ │ ├── par_crop.py │ │ │ │ └── readme.md │ │ │ ├── RGBT234/ │ │ │ │ ├── RGBT234_genjson.py │ │ │ │ ├── gen_json.py │ │ │ │ ├── par_crop.py │ │ │ │ └── readme.md │ │ │ ├── coco/ │ │ │ │ ├── gen_json.py │ │ │ │ ├── par_crop.py │ │ │ │ └── readme.md │ │ │ ├── det/ │ │ │ │ ├── gen_json.py │ │ │ │ ├── par_crop.py │ │ │ │ └── readme.md │ │ │ ├── got10k/ │ │ │ │ ├── gen_json.py │ │ │ │ ├── par_crop.py │ │ │ │ ├── parser_got10k.py │ │ │ │ └── readme.md │ │ │ ├── lasot/ │ │ │ │ ├── gen_json.py │ │ │ │ ├── par_crop.py │ │ │ │ ├── parser_lasot.py │ │ │ │ └── readme.md │ │ │ ├── vid/ │ │ │ │ ├── gen_json.py │ │ │ │ ├── par_crop.py │ │ │ │ ├── parse_vid.py │ │ │ │ └── readme.md │ │ │ └── visdrone/ │ │ │ ├── gen_json.py │ │ │ ├── par_crop.py │ │ │ ├── parser_visdrone.py │ │ │ └── readme.md │ │ ├── ocean.py │ │ └── siamfc.py │ ├── eval_toolkit/ │ │ ├── bin/ │ │ │ ├── _init_paths.py │ │ │ └── eval.py │ │ ├── pysot/ │ │ │ ├── __init__.py │ │ │ ├── datasets/ │ │ │ │ ├── __init__.py │ │ │ │ ├── dataset.py │ │ │ │ ├── got10k.py │ │ │ │ ├── lasot.py │ │ │ │ ├── nfs.py │ │ │ │ ├── otb.py │ │ │ │ ├── trackingnet.py │ │ │ │ 
├── uav.py │ │ │ │ ├── video.py │ │ │ │ └── vot.py │ │ │ ├── evaluation/ │ │ │ │ ├── __init__.py │ │ │ │ ├── ar_benchmark.py │ │ │ │ ├── eao_benchmark.py │ │ │ │ ├── f1_benchmark.py │ │ │ │ └── ope_benchmark.py │ │ │ ├── utils/ │ │ │ │ ├── __init__.py │ │ │ │ ├── c_region.pxd │ │ │ │ ├── misc.py │ │ │ │ ├── region.c │ │ │ │ ├── region.pyx │ │ │ │ ├── setup.py │ │ │ │ ├── src/ │ │ │ │ │ ├── buffer.h │ │ │ │ │ ├── region.c │ │ │ │ │ └── region.h │ │ │ │ └── statistics.py │ │ │ └── visualization/ │ │ │ ├── __init__.py │ │ │ ├── draw_eao.py │ │ │ ├── draw_f1.py │ │ │ ├── draw_success_precision.py │ │ │ └── draw_utils.py │ │ └── requirements.txt │ ├── models/ │ │ ├── __init__.py │ │ ├── backbones.py │ │ ├── cfnet.py │ │ ├── connect.py │ │ ├── modules.py │ │ ├── online/ │ │ │ ├── __init__.py │ │ │ ├── backbone/ │ │ │ │ ├── __init__.py │ │ │ │ ├── resnet.py │ │ │ │ └── resnet18_vggm.py │ │ │ ├── bbreg/ │ │ │ │ ├── __init__.py │ │ │ │ └── iou_net.py │ │ │ ├── classifier/ │ │ │ │ ├── __init__.py │ │ │ │ ├── features.py │ │ │ │ ├── initializer.py │ │ │ │ ├── linear_filter.py │ │ │ │ └── optimizer.py │ │ │ └── layers/ │ │ │ ├── __init__.py │ │ │ ├── activation.py │ │ │ ├── blocks.py │ │ │ ├── distance.py │ │ │ ├── filter.py │ │ │ ├── normalization.py │ │ │ └── transform.py │ │ └── siamfc.py │ ├── online/ │ │ ├── __init__.py │ │ ├── augmentation.py │ │ ├── base_actor.py │ │ ├── base_trainer.py │ │ ├── complex.py │ │ ├── dcf.py │ │ ├── extractor.py │ │ ├── fourier.py │ │ ├── loading.py │ │ ├── ltr_trainer.py │ │ ├── model_constructor.py │ │ ├── operation.py │ │ ├── optim.py │ │ ├── optimization.py │ │ ├── preprocessing.py │ │ ├── tensordict.py │ │ ├── tensorlist.py │ │ └── tracking.py │ ├── tracker/ │ │ ├── ocean.py │ │ ├── oceanplus.py │ │ ├── online.py │ │ └── siamfc.py │ ├── utils/ │ │ ├── __init__.py │ │ ├── cutout.py │ │ ├── extract_tpejson_fc.py │ │ ├── extract_tpejson_ocean.py │ │ ├── extract_tpelog.py │ │ ├── extract_tpelog_fc.py │ │ ├── utils.py │ │ └── watch_tpe.sh │ 
└── version.py └── utils/ ├── __init__.py ├── box.py ├── io.py ├── log.py ├── mask.py ├── meter.py ├── palette.py └── visualize.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ docs/test_video.mp4 test/test_tao.py config/tao* tracker/mot/tao.py eval/error_log.txt config/got10k* config/lasot* config/tc128* config/tlp* config/trackingnet* config/vfs* config/ssib* weights/ results/ out/ vis/ *.ipynb # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2021 ZhongdaoWang Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================

-------------------------------------------------------------------------------- **[NeurIPS 2021] Do different tracking tasks require different appearance models?** **[[ArXiv](https://arxiv.org/abs/2107.02156)]** **[[Project Page](https://zhongdao.github.io/UniTrack)]** UniTrack is a simple and unified framework for addressing multiple tracking tasks. Being a fundamental problem in computer vision, tracking has been fragmented into a multitude of different experimental setups. As a consequence, the literature has fragmented too, and now the novel approaches proposed by the community are usually specialized to fit only one specific setup. To understand to what extent this specialization is actually necessary, we present UniTrack, a solution to address multiple different tracking tasks within the same framework. All tasks share the same [appearance model](#appearance-model). UniTrack - Does **NOT** need training on a specific tracking task. - Shows [competitive performance](docs/RESULTS.md) on six out of seven tracking tasks considered. - Can be easily adapted to even [more tasks](#demo). - Can be used as an evaluation platform to [test pre-trained self-supervised models](docs/MODELZOO.md). ## Demo **Multi-Object Tracking demo for 80 COCO classes ([YOLOX](https://github.com/Megvii-BaseDetection/YOLOX) + UniTrack)** In this demo we run the YOLOX detector and perform MOT for the 80 COCO classes. Try the demo by: ```python python demo/mot_demo.py --classes cls1 cls2 ... clsN ``` where cls1 to clsN represent the indices of classes you would like to detect and track. See [here](https://gist.github.com/AruniRC/7b3dadd004da04c80198557db5da4bda) for the index list. By default all 80 classes are detected and tracked. 
**Single-Object Tracking demo for custom videos** ```python python demo/sot_demo.py --config ./config/imagenet_resnet18_s3.yaml --input /path/to/your/video ``` In this demo, you are asked to annotate the target to be tracked, by drawing a rectangle in the first frame of the video. Then the algorithm tracks the target in the following timesteps without object detection. ## Tasks & Framework ![tasksframework](docs/tasksframework.png) ### Tasks We classify existing tracking tasks along four axes: (1) Single or multiple targets; (2) Users specify targets or automatic detectors specify targets; (3) Observation formats (bounding box/mask/pose); (4) Class-agnostic or class-specific (e.g. humans/vehicles). We mainly experiment on 5 tasks: **SOT, VOS, MOT, MOTS, and PoseTrack**. Task setups are summarized in the above figure. ### Appearance model An appearance model is the only learnable component in UniTrack. It should provide a universal visual representation, and is usually pre-trained on a large-scale dataset in a supervised or unsupervised manner. Typical examples include ImageNet pre-trained ResNets (supervised), and recent self-supervised models such as MoCo and SimCLR (unsupervised). ### Propagation and Association *Propagation* and *Association* are the two core primitives used in UniTrack to address a wide variety of tracking tasks (currently 7, but more can be added). Both use the features extracted by the pre-trained appearance model. For propagation, we adopt existing methods such as [cross correlation](https://www.robots.ox.ac.uk/~luca/siamese-fc.html), [DCF](https://openaccess.thecvf.com/content_cvpr_2017/html/Valmadre_End-To-End_Representation_Learning_CVPR_2017_paper.html), and [mask propagation](https://github.com/ajabri/videowalk). For association we employ a simple algorithm as in [JDE](https://github.com/Zhongdao/Towards-Realtime-MOT) and develop a novel reconstruction-based similarity metric that allows comparing objects across shapes and sizes. 
## Getting started 1. Installation: Please check out [docs/INSTALL.md](docs/INSTALL.md) 2. Data preparation: Please check out [docs/DATA.md](docs/DATA.md) 3. Appearance model preparation: Please check out [docs/MODELZOO.md](docs/MODELZOO.md) 4. Run evaluation on all datasets: Please check out [docs/RUN.md](docs/RUN.md) ## Results Below we show results of UniTrack with a simple **ImageNet Pre-trained ResNet-18** as the appearance model. More results can be found in [RESULTS.md](docs/RESULTS.md). **Single Object Tracking (SOT) on OTB-2015** **Video Object Segmentation (VOS) on DAVIS-2017 *val* split** **Multiple Object Tracking (MOT) on MOT-16 [*test* set *private detector* track](https://motchallenge.net/method/MOT=3856&chl=5)** (Detections from FairMOT) **Multiple Object Tracking and Segmentation (MOTS) on MOTS challenge [*test* set](https://motchallenge.net/method/MOTS=109&chl=17)** (Detections from COSTA_st) **Pose Tracking on PoseTrack-2018 *val* split** (Detections from LightTrack) ## Acknowledgement A part of code is borrowed from [VideoWalk](https://github.com/ajabri/videowalk) by Allan A. 
Jabri [SOT code](https://github.com/JudasDie/SOTS) by Zhipeng Zhang ## Citation ```bibtex @article{wang2021different, author = {Wang, Zhongdao and Zhao, Hengshuang and Li, Ya-Li and Wang, Shengjin and Torr, Philip and Bertinetto, Luca}, title = {Do different tracking tasks require different appearance models?}, journal = {Thirty-Fifth Conference on Neural Information Processing Systems}, year = {2021}, } ``` ================================================ FILE: config/crw_resnet18_s3.yaml ================================================ common: exp_name: crw_resnet18_s3 # Model related model_type: crw remove_layers: ['layer4'] im_mean: [0.4914, 0.4822, 0.4465] im_std: [0.2023, 0.1994, 0.2010] nopadding: False head_depth: -1 resume: 'weights/crw.pth' # Misc down_factor: 8 infer2D: True workers: 4 gpu_id: 0 device: cuda sot: dataset: 'OTB2015' dataroot: '/home/wangzd/datasets/GOT/OTB100/' epoch_test: False vos: davisroot: '/home/wangzd/datasets/uvc/DAVIS/' split: 'val' temperature: 0.05 topk: 10 radius: 12 videoLen: 5 cropSize: -1 head_depth: -1 no_l2: False long_mem: [0] infer2D: False norm_mask: False mot: obid: 'FairMOT' mot_root: '/home/wangzd/datasets/MOT/MOT16' feat_size: [4,10] save_videos: True save_images: False test_mot16: False track_buffer: 30 min_box_area: 200 nms_thres: 0.4 conf_thres: 0.5 iou_thres: 0.5 dup_iou_thres: 0.15 confirm_iou_thres: 0.7 img_size: [1088, 608] prop_flag: False use_kalman: True asso_with_motion: True motion_lambda: 0.98 motion_gated: True mots: obid: 'COSTA' mots_root: '/home/wangzd/datasets/GOT/MOTS' save_videos: False save_images: True test: False track_buffer: 30 nms_thres: 0.4 conf_thres: 0.5 iou_thres: 0.5 prop_flag: False max_mask_area: 200 dup_iou_thres: 0.15 confirm_iou_thres: 0.7 first_stage_thres: 0.7 feat_size: [4,10] use_kalman: True asso_with_motion: True motion_lambda: 0.98 motion_gated: False posetrack: obid: 'lighttrack_MSRA152' data_root: '/home/wangzd/datasets/GOT/Posetrack2018' split: 'val' track_buffer: 30 
nms_thres: 0.4 conf_thres: 0.5 iou_thres: 0.5 frame_rate: 6 save_videos: False save_images: True prop_flag: False feat_size: [4,10] max_mask_area: 400 dup_iou_thres: 0.2 confirm_iou_thres: 0.6 first_stage_thres: 0.7 use_kalman: True asso_with_motion: True motion_lambda: 0.9999 motion_gated: False only_position: True ================================================ FILE: config/crw_resnet18_s3_womotion.yaml ================================================ common: exp_name: crw_resnet18_s3_womotion # Model related model_type: crw remove_layers: ['layer4'] im_mean: [0.4914, 0.4822, 0.4465] im_std: [0.2023, 0.1994, 0.2010] nopadding: False head_depth: -1 resume: 'weights/crw.pth' # Misc down_factor: 8 infer2D: True workers: 4 gpu_id: 0 device: cuda sot: dataset: 'OTB2015' dataroot: '/home/wangzd/datasets/GOT/OTB100/' epoch_test: False vos: davisroot: '/home/wangzd/datasets/uvc/DAVIS/' split: 'val' temperature: 0.05 topk: 10 radius: 12 videoLen: 5 cropSize: -1 head_depth: -1 no_l2: False long_mem: [0] infer2D: False norm_mask: False mot: obid: 'FairMOT' mot_root: '/home/wangzd/datasets/MOT/MOT16' feat_size: [4,10] save_videos: True save_images: False test_mot16: False track_buffer: 30 min_box_area: 200 nms_thres: 0.4 conf_thres: 0.5 iou_thres: 0.5 dup_iou_thres: 0.15 confirm_iou_thres: 0.7 img_size: [1088, 608] prop_flag: False use_kalman: True asso_with_motion: False motion_lambda: 1 motion_gated: False mots: obid: 'COSTA' mots_root: '/home/wangzd/datasets/GOT/MOTS' save_videos: False save_images: True test: False track_buffer: 30 nms_thres: 0.4 conf_thres: 0.5 iou_thres: 0.5 prop_flag: False max_mask_area: 200 dup_iou_thres: 0.15 confirm_iou_thres: 0.7 first_stage_thres: 0.7 feat_size: [4,10] use_kalman: True asso_with_motion: False motion_lambda: 1 motion_gated: False posetrack: obid: 'lighttrack_MSRA152' data_root: '/home/wangzd/datasets/GOT/Posetrack2018' split: 'val' track_buffer: 30 nms_thres: 0.4 conf_thres: 0.5 iou_thres: 0.5 frame_rate: 6 save_videos: False 
save_images: True prop_flag: False feat_size: [4,10] max_mask_area: 400 dup_iou_thres: 0.2 confirm_iou_thres: 0.6 first_stage_thres: 0.7 use_kalman: True asso_with_motion: False motion_lambda: 1 motion_gated: False only_position: True ================================================ FILE: config/imagenet_resnet18_s3.yaml ================================================ common: exp_name: imagenet_resnet18_s3 # Model related model_type: imagenet18 remove_layers: ['layer4'] im_mean: [0.485, 0.456, 0.406] im_std: [0.229, 0.224, 0.225] nopadding: False resume: None # Misc down_factor: 8 infer2D: True workers: 4 gpu_id: 0 device: cuda sot: dataset: 'OTB2015' dataroot: '/home/wangzd/datasets/GOT/OTB100/' epoch_test: False vos: davisroot: '/home/wangzd/datasets/uvc/DAVIS/' split: 'val' temperature: 0.05 topk: 10 radius: 12 videoLen: 5 cropSize: -1 head_depth: -1 no_l2: False long_mem: [0] infer2D: False norm_mask: False mot: obid: 'FairMOT' mot_root: '/home/wangzd/datasets/MOT/MOT16' feat_size: [4,10] save_videos: True save_images: False test_mot16: False track_buffer: 30 min_box_area: 200 nms_thres: 0.4 conf_thres: 0.5 iou_thres: 0.5 dup_iou_thres: 0.15 confirm_iou_thres: 0.7 img_size: [1088, 608] prop_flag: False use_kalman: True asso_with_motion: True motion_lambda: 0.98 motion_gated: True mots: obid: 'COSTA' mots_root: '/home/wangzd/datasets/GOT/MOTS' save_videos: False save_images: True test: False track_buffer: 30 nms_thres: 0.4 conf_thres: 0.5 iou_thres: 0.5 prop_flag: False max_mask_area: 200 dup_iou_thres: 0.15 confirm_iou_thres: 0.7 first_stage_thres: 0.7 feat_size: [4,10] use_kalman: True asso_with_motion: True motion_lambda: 0.98 motion_gated: False posetrack: obid: 'lighttrack_MSRA152' data_root: '/home/wangzd/datasets/GOT/Posetrack2018' split: 'val' track_buffer: 30 nms_thres: 0.4 conf_thres: 0.5 iou_thres: 0.5 frame_rate: 6 save_videos: False save_images: True prop_flag: False feat_size: [4,10] max_mask_area: 400 dup_iou_thres: 0.2 confirm_iou_thres: 0.6 
first_stage_thres: 0.7 use_kalman: True asso_with_motion: True motion_lambda: 0.9999 motion_gated: False only_position: True ================================================ FILE: config/imagenet_resnet18_s3_womotion.yaml ================================================ common: exp_name: imagenet_resnet18_s3_womotion # Model related model_type: imagenet18 remove_layers: ['layer4'] im_mean: [0.485, 0.456, 0.406] im_std: [0.229, 0.224, 0.225] nopadding: False resume: None # Misc down_factor: 8 infer2D: True workers: 4 gpu_id: 0 device: cuda sot: dataset: 'OTB2015' dataroot: '/home/wangzd/datasets/GOT/OTB100/' epoch_test: False vos: davisroot: '/home/wangzd/datasets/uvc/DAVIS/' split: 'val' temperature: 0.05 topk: 10 radius: 12 videoLen: 5 cropSize: -1 head_depth: -1 no_l2: False long_mem: [0] infer2D: False norm_mask: False mot: obid: 'FairMOT' mot_root: '/home/wangzd/datasets/MOT/MOT16' feat_size: [4,10] save_videos: True save_images: False test_mot16: False track_buffer: 30 min_box_area: 200 nms_thres: 0.4 conf_thres: 0.5 iou_thres: 0.5 dup_iou_thres: 0.15 confirm_iou_thres: 0.7 img_size: [1088, 608] prop_flag: False use_kalman: True asso_with_motion: False motion_lambda: 1 motion_gated: False mots: obid: 'COSTA' mots_root: '/home/wangzd/datasets/GOT/MOTS' save_videos: False save_images: True test: False track_buffer: 30 nms_thres: 0.4 conf_thres: 0.5 iou_thres: 0.5 prop_flag: False max_mask_area: 200 dup_iou_thres: 0.15 confirm_iou_thres: 0.7 first_stage_thres: 0.7 feat_size: [4,10] use_kalman: True asso_with_motion: False motion_lambda: 1 motion_gated: False posetrack: obid: 'lighttrack_MSRA152' data_root: '/home/wangzd/datasets/GOT/Posetrack2018' split: 'val' track_buffer: 30 nms_thres: 0.4 conf_thres: 0.5 iou_thres: 0.5 frame_rate: 6 save_videos: False save_images: True prop_flag: False feat_size: [4,10] max_mask_area: 400 dup_iou_thres: 0.2 confirm_iou_thres: 0.6 first_stage_thres: 0.7 use_kalman: True asso_with_motion: False motion_lambda: 1 motion_gated: 
False only_position: True vis: obid: 'MaskTrackRCNN' data_root: '/home/wangzd/datasets/GOT/YoutubeVIS/' split: 'val' track_buffer: 30 nms_thres: 0.4 conf_thres: 0.5 iou_thres: 0.5 frame_rate: 6 save_videos: False save_images: True prop_flag: False feat_size: [12,12] max_mask_area: 1000 dup_iou_thres: 0.2 confirm_iou_thres: 0.6 first_stage_thres: 0.9 use_kalman: True asso_with_motion: False motion_lambda: 1 motion_gated: False ================================================ FILE: core/association/__init__.py ================================================ ================================================ FILE: core/association/matching.py ================================================ import pdb import cv2 import torch import torch.nn.functional as F import numpy as np import scipy from scipy.spatial.distance import cdist import lap from cython_bbox import bbox_overlaps as bbox_ious from core.motion import kalman_filter import time def merge_matches(m1, m2, shape): O,P,Q = shape m1 = np.asarray(m1) m2 = np.asarray(m2) M1 = scipy.sparse.coo_matrix((np.ones(len(m1)), (m1[:, 0], m1[:, 1])), shape=(O, P)) M2 = scipy.sparse.coo_matrix((np.ones(len(m2)), (m2[:, 0], m2[:, 1])), shape=(P, Q)) mask = M1*M2 match = mask.nonzero() match = list(zip(match[0], match[1])) unmatched_O = tuple(set(range(O)) - set([i for i, j in match])) unmatched_Q = tuple(set(range(Q)) - set([j for i, j in match])) return match, unmatched_O, unmatched_Q def linear_assignment(cost_matrix, thresh): if cost_matrix.size == 0: return np.empty((0, 2), dtype=int), tuple(range(cost_matrix.shape[0])), tuple(range(cost_matrix.shape[1])) matches, unmatched_a, unmatched_b = [], [], [] cost, x, y = lap.lapjv(cost_matrix, extend_cost=True, cost_limit=thresh) for ix, mx in enumerate(x): if mx >= 0: matches.append([ix, mx]) unmatched_a = np.where(x < 0)[0] unmatched_b = np.where(y < 0)[0] matches = np.asarray(matches) return matches, unmatched_a, unmatched_b def ious(atlbrs, btlbrs): """ Compute cost based on 
IoU :type atlbrs: list[tlbr] | np.ndarray :type atlbrs: list[tlbr] | np.ndarray :rtype ious np.ndarray """ ious = np.zeros((len(atlbrs), len(btlbrs)), dtype=np.float) if ious.size == 0: return ious ious = bbox_ious( np.ascontiguousarray(atlbrs, dtype=np.float), np.ascontiguousarray(btlbrs, dtype=np.float) ) return ious def iou_distance(atracks, btracks): """ Compute cost based on IoU :type atracks: list[STrack] :type btracks: list[STrack] :rtype cost_matrix np.ndarray """ if (len(atracks)>0 and isinstance(atracks[0], np.ndarray)) or (len(btracks) > 0 and isinstance(btracks[0], np.ndarray)): atlbrs = atracks btlbrs = btracks else: atlbrs = [track.tlbr for track in atracks] btlbrs = [track.tlbr for track in btracks] _ious = ious(atlbrs, btlbrs) cost_matrix = 1 - _ious return cost_matrix def embedding_distance(tracks, detections, metric='cosine'): """ :param tracks: list[STrack] :param detections: list[BaseTrack] :param metric: :return: cost_matrix np.ndarray """ cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float) if cost_matrix.size == 0: return cost_matrix det_features = np.asarray([track.curr_feat for track in detections], dtype=np.float) track_features = np.asarray([track.smooth_feat for track in tracks], dtype=np.float) cost_matrix = np.maximum(0.0, cdist(track_features, det_features)) # Nomalized features return cost_matrix def fuse_motion(kf, cost_matrix, tracks, detections, only_position=False, lambda_=0.98, gate=True): if cost_matrix.size == 0: return cost_matrix gating_dim = 2 if only_position else 4 gating_threshold = kalman_filter.chi2inv95[gating_dim] measurements = np.asarray([det.to_xyah() for det in detections]) for row, track in enumerate(tracks): gating_distance = kf.gating_distance( track.mean, track.covariance, measurements, only_position, metric='maha') if gate: cost_matrix[row, gating_distance > gating_threshold] = np.inf cost_matrix[row] = lambda_ * cost_matrix[row] + (1-lambda_)* gating_distance return cost_matrix def 
center_emb_distance(tracks, detections, metric='cosine'): """ :param tracks: list[STrack] :param detections: list[BaseTrack] :param metric: :return: cost_matrix np.ndarray """ cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float) if cost_matrix.size == 0: return cost_matrix det_features = torch.stack([track.curr_feat.squeeze() for track in detections]) track_features = torch.stack([track.smooth_feat.squeeze() for track in tracks]) normed_det = F.normalize(det_features) normed_track = F.normalize(track_features) cost_matrix = torch.mm(normed_track, normed_det.T) cost_matrix = 1 - cost_matrix.detach().cpu().numpy() return cost_matrix def recons_distance(tracks, detections, tmp=100): """ :param tracks: list[STrack] :param detections: list[BaseTrack] :param metric: :return: cost_matrix np.ndarray """ cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float) if cost_matrix.size == 0: return cost_matrix det_features_ = torch.stack([track.curr_feat.squeeze() for track in detections]) track_features_ = torch.stack([track.smooth_feat for track in tracks]) det_features = F.normalize(det_features_, dim=1) track_features = F.normalize(track_features_, dim=1) ndet, ndim, nw, nh = det_features.shape ntrk, _, _, _ = track_features.shape fdet = det_features.permute(0,2,3,1).reshape(-1, ndim).cuda() # ndet*nw*nh, ndim ftrk = track_features.permute(0,2,3,1).reshape(-1, ndim).cuda() # ntrk*nw*nh, ndim aff = torch.mm(ftrk, fdet.transpose(0,1)) # ntrk*nw*nh, ndet*nw*nh aff_td = F.softmax(tmp*aff, dim=1) aff_dt = F.softmax(tmp*aff, dim=0).transpose(0,1) recons_ftrk = torch.einsum('tds,dsm->tdm', aff_td.view(ntrk*nw*nh, ndet, nw*nh), fdet.view(ndet, nw*nh, ndim)) # ntrk*nw*nh, ndet, ndim recons_fdet = torch.einsum('dts,tsm->dtm', aff_dt.view(ndet*nw*nh, ntrk, nw*nh), ftrk.view(ntrk, nw*nh, ndim)) # ndet*nw*nh, ntrk, ndim res_ftrk = (recons_ftrk.permute(0,2,1) - ftrk.unsqueeze(-1)).view(ntrk, nw*nh*ndim, ndet) res_fdet = (recons_fdet.permute(0,2,1) - 
fdet.unsqueeze(-1)).view(ndet, nw*nh*ndim, ntrk) cost_matrix = (torch.abs(res_ftrk).mean(1) + torch.abs(res_fdet).mean(1).transpose(0,1)) * 0.5 cost_matrix = cost_matrix / cost_matrix.max(1)[0].unsqueeze(-1) #pdb.set_trace() cost_matrix = cost_matrix.cpu().numpy() return cost_matrix def get_track_feat(tracks, feat_flag='curr'): if feat_flag == 'curr': feat_list = [track.curr_feat.squeeze(0) for track in tracks] elif feat_flag == 'smooth': feat_list = [track.smooth_feat.squeeze(0) for track in tracks] else: raise NotImplementedError n = len(tracks) fdim = feat_list[0].shape[0] fdim_num = len(feat_list[0].shape) if fdim_num > 2: feat_list = [f.view(fdim,-1) for f in feat_list] numels = [f.shape[1] for f in feat_list] ret = torch.zeros(n, fdim, np.max(numels)).to(feat_list[0].device) for i, f in enumerate(feat_list): ret[i, :, :numels[i]] = f return ret def reconsdot_distance(tracks, detections, tmp=100): """ :param tracks: list[STrack] :param detections: list[BaseTrack] :param metric: :return: cost_matrix np.ndarray """ cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float) if cost_matrix.size == 0: return cost_matrix, None det_features_ = get_track_feat(detections) track_features_ = get_track_feat(tracks, feat_flag='curr') det_features = F.normalize(det_features_, dim=1) track_features = F.normalize(track_features_, dim=1) ndet, ndim, nsd = det_features.shape ntrk, _, nst = track_features.shape fdet = det_features.permute(0, 2, 1).reshape(-1, ndim).cuda() ftrk = track_features.permute(0, 2, 1).reshape(-1, ndim).cuda() aff = torch.mm(ftrk, fdet.transpose(0, 1)) aff_td = F.softmax(tmp*aff, dim=1) aff_dt = F.softmax(tmp*aff, dim=0).transpose(0, 1) recons_ftrk = torch.einsum('tds,dsm->tdm', aff_td.view(ntrk*nst, ndet, nsd), fdet.view(ndet, nsd, ndim)) recons_fdet = torch.einsum('dts,tsm->dtm', aff_dt.view(ndet*nsd, ntrk, nst), ftrk.view(ntrk, nst, ndim)) recons_ftrk = recons_ftrk.permute(0, 2, 1).view(ntrk, nst*ndim, ndet) recons_ftrk_norm = 
F.normalize(recons_ftrk, dim=1) recons_fdet = recons_fdet.permute(0, 2, 1).view(ndet, nsd*ndim, ntrk) recons_fdet_norm = F.normalize(recons_fdet, dim=1) dot_td = torch.einsum('tad,ta->td', recons_ftrk_norm, F.normalize(ftrk.reshape(ntrk, nst*ndim), dim=1)) dot_dt = torch.einsum('dat,da->dt', recons_fdet_norm, F.normalize(fdet.reshape(ndet, nsd*ndim), dim=1)) cost_matrix = 1 - 0.5 * (dot_td + dot_dt.transpose(0, 1)) cost_matrix = cost_matrix.detach().cpu().numpy() return cost_matrix, None def category_gate(cost_matrix, tracks, detections): """ :param tracks: list[STrack] :param detections: list[BaseTrack] :param metric: :return: cost_matrix np.ndarray """ if cost_matrix.size == 0: return cost_matrix det_categories = np.array([d.category for d in detections]) trk_categories = np.array([t.category for t in tracks]) cost_matrix = cost_matrix + np.abs( det_categories[None, :] - trk_categories[:, None]) return cost_matrix ================================================ FILE: core/motion/kalman_filter.py ================================================ # vim: expandtab:ts=4:sw=4 import numpy as np import scipy.linalg """ Table for the 0.95 quantile of the chi-square distribution with N degrees of freedom (contains values for N=1, ..., 9). Taken from MATLAB/Octave's chi2inv function and used as Mahalanobis gating threshold. """ chi2inv95 = { 1: 3.8415, 2: 5.9915, 3: 7.8147, 4: 9.4877, 5: 11.070, 6: 12.592, 7: 14.067, 8: 15.507, 9: 16.919} class KalmanFilter(object): """ A simple Kalman filter for tracking bounding boxes in image space. The 8-dimensional state space x, y, a, h, vx, vy, va, vh contains the bounding box center position (x, y), aspect ratio a, height h, and their respective velocities. Object motion follows a constant velocity model. The bounding box location (x, y, a, h) is taken as direct observation of the state space (linear observation model). """ def __init__(self): ndim, dt = 4, 1. # Create Kalman filter model matrices. 
self._motion_mat = np.eye(2 * ndim, 2 * ndim) for i in range(ndim): self._motion_mat[i, ndim + i] = dt self._update_mat = np.eye(ndim, 2 * ndim) # Motion and observation uncertainty are chosen relative to the current # state estimate. These weights control the amount of uncertainty in # the model. This is a bit hacky. self._std_weight_position = 1. / 20 self._std_weight_velocity = 1. / 160 def initiate(self, measurement): """Create track from unassociated measurement. Parameters ---------- measurement : ndarray Bounding box coordinates (x, y, a, h) with center position (x, y), aspect ratio a, and height h. Returns ------- (ndarray, ndarray) Returns the mean vector (8 dimensional) and covariance matrix (8x8 dimensional) of the new track. Unobserved velocities are initialized to 0 mean. """ mean_pos = measurement mean_vel = np.zeros_like(mean_pos) mean = np.r_[mean_pos, mean_vel] std = [ 2 * self._std_weight_position * measurement[3], 2 * self._std_weight_position * measurement[3], 1e-2, 2 * self._std_weight_position * measurement[3], 10 * self._std_weight_velocity * measurement[3], 10 * self._std_weight_velocity * measurement[3], 1e-5, 10 * self._std_weight_velocity * measurement[3]] covariance = np.diag(np.square(std)) return mean, covariance def predict(self, mean, covariance): """Run Kalman filter prediction step. Parameters ---------- mean : ndarray The 8 dimensional mean vector of the object state at the previous time step. covariance : ndarray The 8x8 dimensional covariance matrix of the object state at the previous time step. Returns ------- (ndarray, ndarray) Returns the mean vector and covariance matrix of the predicted state. Unobserved velocities are initialized to 0 mean. 
""" std_pos = [ self._std_weight_position * mean[3], self._std_weight_position * mean[3], 1e-2, self._std_weight_position * mean[3]] std_vel = [ self._std_weight_velocity * mean[3], self._std_weight_velocity * mean[3], 1e-5, self._std_weight_velocity * mean[3]] motion_cov = np.diag(np.square(np.r_[std_pos, std_vel])) mean = np.dot(mean, self._motion_mat.T) covariance = np.linalg.multi_dot(( self._motion_mat, covariance, self._motion_mat.T)) + motion_cov return mean, covariance def project(self, mean, covariance): """Project state distribution to measurement space. Parameters ---------- mean : ndarray The state's mean vector (8 dimensional array). covariance : ndarray The state's covariance matrix (8x8 dimensional). Returns ------- (ndarray, ndarray) Returns the projected mean and covariance matrix of the given state estimate. """ std = [ self._std_weight_position * mean[3], self._std_weight_position * mean[3], 1e-1, self._std_weight_position * mean[3]] innovation_cov = np.diag(np.square(std)) mean = np.dot(self._update_mat, mean) covariance = np.linalg.multi_dot(( self._update_mat, covariance, self._update_mat.T)) return mean, covariance + innovation_cov def multi_predict(self, mean, covariance): """Run Kalman filter prediction step (Vectorized version). Parameters ---------- mean : ndarray The Nx8 dimensional mean matrix of the object states at the previous time step. covariance : ndarray The Nx8x8 dimensional covariance matrics of the object states at the previous time step. Returns ------- (ndarray, ndarray) Returns the mean vector and covariance matrix of the predicted state. Unobserved velocities are initialized to 0 mean. 
""" std_pos = [ self._std_weight_position * mean[:, 3], self._std_weight_position * mean[:, 3], 1e-2 * np.ones_like(mean[:, 3]), self._std_weight_position * mean[:, 3]] std_vel = [ self._std_weight_velocity * mean[:, 3], self._std_weight_velocity * mean[:, 3], 1e-5 * np.ones_like(mean[:, 3]), self._std_weight_velocity * mean[:, 3]] sqr = np.square(np.r_[std_pos, std_vel]).T motion_cov = [] for i in range(len(mean)): motion_cov.append(np.diag(sqr[i])) motion_cov = np.asarray(motion_cov) mean = np.dot(mean, self._motion_mat.T) left = np.dot(self._motion_mat, covariance).transpose((1,0,2)) covariance = np.dot(left, self._motion_mat.T) + motion_cov return mean, covariance def update(self, mean, covariance, measurement): """Run Kalman filter correction step. Parameters ---------- mean : ndarray The predicted state's mean vector (8 dimensional). covariance : ndarray The state's covariance matrix (8x8 dimensional). measurement : ndarray The 4 dimensional measurement vector (x, y, a, h), where (x, y) is the center position, a the aspect ratio, and h the height of the bounding box. Returns ------- (ndarray, ndarray) Returns the measurement-corrected state distribution. """ projected_mean, projected_cov = self.project(mean, covariance) chol_factor, lower = scipy.linalg.cho_factor( projected_cov, lower=True, check_finite=False) kalman_gain = scipy.linalg.cho_solve( (chol_factor, lower), np.dot(covariance, self._update_mat.T).T, check_finite=False).T innovation = measurement - projected_mean new_mean = mean + np.dot(innovation, kalman_gain.T) new_covariance = covariance - np.linalg.multi_dot(( kalman_gain, projected_cov, kalman_gain.T)) return new_mean, new_covariance def gating_distance(self, mean, covariance, measurements, only_position=False, metric='maha'): """Compute gating distance between state distribution and measurements. A suitable distance threshold can be obtained from `chi2inv95`. 
If `only_position` is False, the chi-square distribution has 4 degrees of freedom, otherwise 2. Parameters ---------- mean : ndarray Mean vector over the state distribution (8 dimensional). covariance : ndarray Covariance of the state distribution (8x8 dimensional). measurements : ndarray An Nx4 dimensional matrix of N measurements, each in format (x, y, a, h) where (x, y) is the bounding box center position, a the aspect ratio, and h the height. only_position : Optional[bool] If True, distance computation is done with respect to the bounding box center position only. Returns ------- ndarray Returns an array of length N, where the i-th element contains the squared Mahalanobis distance between (mean, covariance) and `measurements[i]`. """ mean, covariance = self.project(mean, covariance) if only_position: mean, covariance = mean[:2], covariance[:2, :2] measurements = measurements[:, :2] d = measurements - mean if metric == 'gaussian': return np.sum(d * d, axis=1) elif metric == 'maha': cholesky_factor = np.linalg.cholesky(covariance) z = scipy.linalg.solve_triangular( cholesky_factor, d.T, lower=True, check_finite=False, overwrite_b=True) squared_maha = np.sum(z * z, axis=0) return squared_maha else: raise ValueError('invalid distance metric') ================================================ FILE: core/propagation/__init__.py ================================================ ################################################################### # File Name: __init__.py # Author: Zhongdao Wang # mail: wcd17@mails.tsinghua.edu.cn # Created Time: Mon Jan 18 15:57:34 2021 ################################################################### from __future__ import print_function from __future__ import division from __future__ import absolute_import from .propagate_box import propagate_box from .propagate_mask import propagate_mask from .propagate_pose import propagate_pose def propagate(temp_feats, obs, img, model, format='box'): if format == 'box': return 
propagate_box(temp_feats, obs, img, model) elif format == 'mask': return propagate_box(temp_feats, obs, img, model) elif format == 'pose': return propagate_pose(temp_feats, obs, img, model) else: raise ValueError('Observation format not supported.') ================================================ FILE: core/propagation/propagate_box.py ================================================ ################################################################### # File Name: propagate_box.py # Author: Zhongdao Wang # mail: wcd17@mails.tsinghua.edu.cn # Created Time: Mon Jan 18 16:01:46 2021 ################################################################### from __future__ import print_function from __future__ import division from __future__ import absolute_import def propagate_box(temp_feats, box, img, model): pass ================================================ FILE: core/propagation/propagate_mask.py ================================================ ################################################################### # File Name: propagate_box.py # Author: Zhongdao Wang # mail: wcd17@mails.tsinghua.edu.cn # Created Time: Mon Jan 18 16:01:46 2021 ################################################################### from __future__ import print_function from __future__ import division from __future__ import absolute_import def propagate_mask(temp_feats, mask, img, model): pass ================================================ FILE: core/propagation/propagate_pose.py ================================================ ################################################################### # File Name: propagate_box.py # Author: Zhongdao Wang # mail: wcd17@mails.tsinghua.edu.cn # Created Time: Mon Jan 18 16:01:46 2021 ################################################################### from __future__ import print_function from __future__ import division from __future__ import absolute_import def propagate_pose(temp_feats, pose, img, model): pass 
================================================ FILE: data/jhmdb.py ================================================ from __future__ import print_function, absolute_import import os import numpy as np import math import scipy.io as sio import cv2 import torch from matplotlib import cm from utils import im_to_numpy, im_to_torch def resize(img, owidth, oheight): img = im_to_numpy(img) img = cv2.resize( img, (owidth, oheight) ) img = im_to_torch(img) return img def load_image(img_path): # H x W x C => C x H x W img = cv2.imread(img_path) # print(img_path) img = img.astype(np.float32) img = img / 255.0 img = img[:,:,::-1] img = img.copy() return im_to_torch(img) def color_normalize(x, mean, std): if x.size(0) == 1: x = x.repeat(3, 1, 1) for t, m, s in zip(x, mean, std): t.sub_(m) t.div_(s) return x import time ###################################################################### def try_np_load(p): try: return np.load(p) except: return None def make_lbl_set(lbls): print(lbls.shape) t00 = time.time() lbl_set = [np.zeros(3).astype(np.uint8)] count_lbls = [0] flat_lbls_0 = lbls[0].copy().reshape(-1, lbls.shape[-1]).astype(np.uint8) lbl_set = np.unique(flat_lbls_0, axis=0) # print(lbl_set) # if (lbl_set > 20).sum() > 0: # import pdb; pdb.set_trace() # count_lbls = [np.all(flat_lbls_0 == ll, axis=-1).sum() for ll in lbl_set] print('lbls', time.time() - t00) return lbl_set def texturize(onehot): flat_onehot = onehot.reshape(-1, onehot.shape[-1]) lbl_set = np.unique(flat_onehot, axis=0) count_lbls = [np.all(flat_onehot == ll, axis=-1).sum() for ll in lbl_set] object_id = np.argsort(count_lbls)[::-1][1] hidxs = [] for h in range(onehot.shape[0]): # appears = any(np.all(onehot[h] == lbl_set[object_id], axis=-1)) appears = np.any(onehot[h, :, 1:] == 1) if appears: hidxs.append(h) nstripes = min(10, len(hidxs)) out = np.zeros((*onehot.shape[:2], nstripes+1)) out[:, :, 0] = 1 for i, h in enumerate(hidxs): cidx = int(i // (len(hidxs) / nstripes)) w = np.any(onehot[h, :, 1:] == 1, 
axis=-1) out[h][w] = 0 out[h][w, cidx+1] = 1 # print(i, h, cidx) return out class JhmdbSet(torch.utils.data.Dataset): def __init__(self, args, sigma=0.5): self.filelist = args.filelist self.imgSize = args.imgSize self.videoLen = args.videoLen self.mapScale = args.mapScale self.sigma = sigma f = open(self.filelist, 'r') self.jpgfiles = [] self.lblfiles = [] for line in f: rows = line.split() jpgfile = rows[1] lblfile = rows[0] self.jpgfiles.append(jpgfile) self.lblfiles.append(lblfile) f.close() def get_onehot_lbl(self, lbl_path): name = '/' + '/'.join(lbl_path.split('.')[:-1]) + '_onehot.npy' if os.path.exists(name): return np.load(name) else: return None def make_paths(self, folder_path, label_path): I = [ ll for ll in os.listdir(folder_path) if '.png' in ll ] frame_num = len(I) + self.videoLen I.sort(key=lambda x:int(x.split('.')[0])) I_out, L_out = [], [] for i in range(frame_num): i = max(0, i - self.videoLen) img_path = "%s/%s" % (folder_path, I[i]) I_out.append(img_path) return I_out def __getitem__(self, index): folder_path = self.jpgfiles[index] label_path = self.lblfiles[index] imgs = [] imgs_orig = [] lbls = [] lbls_onehot = [] patches = [] target_imgs = [] img_paths = self.make_paths(folder_path, label_path) frame_num = len(img_paths) mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225] t000 = time.time() # frame_num = 30 for i in range(frame_num): t00 = time.time() img_path = img_paths[i] img = load_image(img_path) # CxHxW # print('loaded', i, time.time() - t00) ht, wd = img.size(1), img.size(2) if self.imgSize > 0: newh, neww = ht, wd if ht <= wd: ratio = 1.0 #float(wd) / float(ht) # width, height img = resize(img, int(self.imgSize * ratio), self.imgSize) newh = self.imgSize neww = int(self.imgSize * ratio) else: ratio = 1.0 #float(ht) / float(wd) # width, height img = resize(img, self.imgSize, int(self.imgSize * ratio)) newh = int(self.imgSize * ratio) neww = self.imgSize img_orig = img.clone() img = color_normalize(img, mean, std) 
imgs_orig.append(img_orig) imgs.append(img) rsz_h, rsz_w = math.ceil(img.size(1) / self.mapScale[0]), math.ceil(img.size(2) /self.mapScale[1]) lbls_mat = sio.loadmat(label_path) lbls_coord = lbls_mat['pos_img'] lbls_coord = lbls_coord - 1 lbls_coord[0, :, :] = lbls_coord[0, :, :] * float(neww) / float(wd) / self.mapScale[0] lbls_coord[1, :, :] = lbls_coord[1, :, :] * float(newh) / float(ht) / self.mapScale[1] lblsize = (rsz_h, rsz_w) lbls = np.zeros((lbls_coord.shape[2], lblsize[0], lblsize[1], lbls_coord.shape[1])) for i in range(lbls_coord.shape[2]): lbls_coord_now = lbls_coord[:, :, i] scales = lbls_coord_now.max(1) - lbls_coord_now.min(1) scale = scales.max() scale = max(0.5, scale*0.015) for j in range(lbls_coord.shape[1]): if self.sigma > 0: draw_labelmap_np(lbls[i, :, :, j], lbls_coord_now[:, j], scale) else: tx = int(lbls_coord_now[0, j]) ty = int(lbls_coord_now[1, j]) if tx < lblsize[1] and ty < lblsize[0] and tx >=0 and ty >=0: lbls[i, ty, tx, j] = 1.0 lbls_tensor = torch.zeros(frame_num, lblsize[0], lblsize[1], lbls_coord.shape[1]) for i in range(frame_num): if i < self.videoLen: nowlbl = lbls[0] else: if(i - self.videoLen < len(lbls)): nowlbl = lbls[i - self.videoLen] lbls_tensor[i] = torch.from_numpy(nowlbl) lbls_tensor = torch.cat([(lbls_tensor.sum(-1) == 0)[..., None] *1.0, lbls_tensor], dim=-1) lblset = np.arange(lbls_tensor.shape[-1]-1) lblset = np.array([[0, 0, 0]] + [cm.Paired(i)[:3] for i in lblset]) * 255.0 # Meta info meta = dict(folder_path=folder_path, img_paths=img_paths, lbl_paths=[]) imgs = torch.stack(imgs) imgs_orig = torch.stack(imgs_orig) lbls_resize = lbls_tensor #np.stack(resizes) assert lbls_resize.shape[0] == len(meta['img_paths']) #print('vid', i, 'took', time.time() - t000) return imgs, imgs_orig, lbls_resize, lbls_tensor, lblset, meta def __len__(self): return len(self.jpgfiles) def draw_labelmap_np(img, pt, sigma, type='Gaussian'): # Draw a 2D gaussian # Adopted from 
https://github.com/anewell/pose-hg-train/blob/master/src/pypose/draw.py # Check that any part of the gaussian is in-bounds ul = [int(pt[0] - 3 * sigma), int(pt[1] - 3 * sigma)] br = [int(pt[0] + 3 * sigma + 1), int(pt[1] + 3 * sigma + 1)] if (ul[0] >= img.shape[1] or ul[1] >= img.shape[0] or br[0] < 0 or br[1] < 0): # If not, just return the image as is return img # Generate gaussian size = 6 * sigma + 1 x = np.arange(0, size, 1, float) y = x[:, np.newaxis] x0 = y0 = size // 2 # The gaussian is not normalized, we want the center value to equal 1 if type == 'Gaussian': g = np.exp(- ((x - x0) ** 2 + (y - y0) ** 2) / (2 * sigma ** 2)) elif type == 'Cauchy': g = sigma / (((x - x0) ** 2 + (y - y0) ** 2 + sigma ** 2) ** 1.5) # Usable gaussian range g_x = max(0, -ul[0]), min(br[0], img.shape[1]) - ul[0] g_y = max(0, -ul[1]), min(br[1], img.shape[0]) - ul[1] # Image range img_x = max(0, ul[0]), min(br[0], img.shape[1]) img_y = max(0, ul[1]), min(br[1], img.shape[0]) img[img_y[0]:img_y[1], img_x[0]:img_x[1]] = g[g_y[0]:g_y[1], g_x[0]:g_x[1]] return img ================================================ FILE: data/kinetics.py ================================================ import torchvision.datasets.video_utils from torchvision.datasets.video_utils import VideoClips from torchvision.datasets.utils import list_dir from torchvision.datasets.folder import make_dataset from torchvision.datasets.vision import VisionDataset import numpy as np class Kinetics400(VisionDataset): """ `Kinetics-400 `_ dataset. Kinetics-400 is an action recognition video dataset. This dataset consider every video as a collection of video clips of fixed size, specified by ``frames_per_clip``, where the step in frames between each clip is given by ``step_between_clips``. 
To give an example, for 2 videos with 10 and 15 frames respectively, if ``frames_per_clip=5`` and ``step_between_clips=5``, the dataset size will be (2 + 3) = 5, where the first two elements will come from video 1, and the next three elements from video 2. Note that we drop clips which do not have exactly ``frames_per_clip`` elements, so not all frames in a video might be present. Internally, it uses a VideoClips object to handle clip creation. Args: root (string): Root directory of the Kinetics-400 Dataset. frames_per_clip (int): number of frames in a clip step_between_clips (int): number of frames between each clip transform (callable, optional): A function/transform that takes in a TxHxWxC video and returns a transformed version. Returns: video (Tensor[T, H, W, C]): the `T` video frames audio(Tensor[K, L]): the audio frames, where `K` is the number of channels and `L` is the number of points label (int): class of the video clip """ def __init__(self, root, frames_per_clip, step_between_clips=1, frame_rate=None, extensions=('mp4',), transform=None, cached=None, _precomputed_metadata=None): super(Kinetics400, self).__init__(root) extensions = extensions classes = list(sorted(list_dir(root))) class_to_idx = {classes[i]: i for i in range(len(classes))} self.samples = make_dataset(self.root, class_to_idx, extensions, is_valid_file=None) self.classes = classes video_list = [x[0] for x in self.samples] self.video_clips = VideoClips( video_list, frames_per_clip, step_between_clips, frame_rate, _precomputed_metadata, ) self.transform = transform def __len__(self): return self.video_clips.num_clips() def __getitem__(self, idx): success = False while not success: try: video, audio, info, video_idx = self.video_clips.get_clip(idx) success = True except: print('skipped idx', idx) idx = np.random.randint(self.__len__()) label = self.samples[video_idx][1] if self.transform is not None: video = self.transform(video) return video, audio, label 
================================================ FILE: data/video.py ================================================ import os import pdb import glob import json import os.path as osp import cv2 import numpy as np import pycocotools.mask as mask_utils from utils.box import xyxy2xywh from torchvision.transforms import transforms as T class LoadImages: # for inference def __init__(self, path, img_size=(1088, 608)): if os.path.isdir(path): image_format = ['.jpg', '.jpeg', '.png', '.tif'] self.files = sorted(glob.glob('%s/*.*' % path)) self.files = list(filter(lambda x: os.path.splitext(x)[1].lower() in image_format, self.files)) elif os.path.isfile(path): self.files = [path] self.nF = len(self.files) # number of image files self.width = img_size[0] self.height = img_size[1] self.count = 0 assert self.nF > 0, 'No images found in ' + path def __iter__(self): self.count = -1 return self def __next__(self): self.count += 1 if self.count == self.nF: raise StopIteration img_path = self.files[self.count] # Read image img0 = cv2.imread(img_path) # BGR assert img0 is not None, 'Failed to load ' + img_path # Padded resize img, _, _, _ = letterbox(img0, height=self.height, width=self.width) # Normalize RGB img = img[:, :, ::-1].transpose(2, 0, 1) img = np.ascontiguousarray(img, dtype=np.float32) img /= 255.0 return img_path, img, img0 def __getitem__(self, idx): idx = idx % self.nF img_path = self.files[idx] # Read image img0 = cv2.imread(img_path) # BGR assert img0 is not None, 'Failed to load ' + img_path # Padded resize img, _, _, _ = letterbox(img0, height=self.height, width=self.width) # Normalize RGB img = img[:, :, ::-1].transpose(2, 0, 1) img = np.ascontiguousarray(img, dtype=np.float32) img /= 255.0 return img_path, img, img0 def __len__(self): return self.nF # number of files class LoadVideo: # for inference def __init__(self, path, img_size=(1088, 608)): self.cap = cv2.VideoCapture(path) self.frame_rate = int(round(self.cap.get(cv2.CAP_PROP_FPS))) self.vw = 
int(self.cap.get(cv2.CAP_PROP_FRAME_WIDTH)) self.vh = int(self.cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) self.vn = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT)) self.width = img_size[0] self.height = img_size[1] self.count = 0 self.w, self.h = self.get_size(self.vw, self.vh, self.width, self.height) print('Lenth of the video: {:d} frames'.format(self.vn)) def get_size(self, vw, vh, dw, dh): wa, ha = float(dw) / vw, float(dh) / vh a = min(wa, ha) return int(vw * a), int(vh*a) def __iter__(self): self.count = -1 return self def __next__(self): self.count += 1 if self.count == len(self): raise StopIteration # Read image res, img0 = self.cap.read() # BGR assert img0 is not None, 'Failed to load frame {:d}'.format(self.count) img0 = cv2.resize(img0, (self.w, self.h)) # Padded resize img, _, _, _ = letterbox(img0, height=self.height, width=self.width) # Normalize RGB img = img[:, :, ::-1] img = np.ascontiguousarray(img, dtype=np.float32) return self.count, img, img0 def __len__(self): return self.vn # number of files class LoadImagesAndObs: def __init__(self, path, opt): obid = opt.obid img_size = getattr(opt,'img_size', None) if os.path.isdir(path): image_format = ['.jpg', '.jpeg', '.png', '.tif'] self.img_files = sorted(glob.glob('%s/*.*' % path)) self.img_files = list(filter( lambda x: os.path.splitext(x)[1].lower() in image_format, self.img_files)) elif os.path.isfile(path): self.img_files = [path,] self.label_files = [x.replace('images', osp.join('obs', obid)).replace( '.png', '.txt').replace('.jpg', '.txt') for x in self.img_files] self.nF = len(self.img_files) # number of image files self.transforms = T.Compose([T.ToTensor(), T.Normalize(opt.im_mean, opt.im_std)]) self.use_lab = getattr(opt, 'use_lab', False) if not img_size is None: self.width = img_size[0] self.height = img_size[1] def __getitem__(self, files_index): img_path = self.img_files[files_index] label_path = self.label_files[files_index] return self.get_data(img_path, label_path) def get_data(self, img_path, 
label_path): height = self.height width = self.width img_ori = cv2.imread(img_path) # BGR if img_ori is None: raise ValueError('File corrupt {}'.format(img_path)) h, w, _ = img_ori.shape img, ratio, padw, padh = letterbox(img_ori, height=height, width=width) # Load labels if os.path.isfile(label_path): labels0 = np.loadtxt(label_path, dtype=np.float32).reshape(-1, 5) # Normalized xywh to pixel xyxy format labels = labels0.copy() labels[:, 0] = ratio * w * (labels0[:, 0] - labels0[:, 2] / 2) + padw labels[:, 1] = ratio * h * (labels0[:, 1] - labels0[:, 3] / 2) + padh labels[:, 2] = ratio * w * (labels0[:, 0] + labels0[:, 2] / 2) + padw labels[:, 3] = ratio * h * (labels0[:, 1] + labels0[:, 3] / 2) + padh else: labels = np.array([]) nL = len(labels) if nL > 0: # convert xyxy to xywh labels[:, 0:4] = xyxy2xywh(labels[:, 0:4].copy()) labels[:, 0] /= width labels[:, 1] /= height labels[:, 2] /= width labels[:, 3] /= height if self.use_lab: img = cv2.cvtColor(img, cv2.COLOR_BGR2LAB) img = np.array([img[:, :, 0], ]*3) img = img.transpose(1, 2, 0) img = img / 255. 
img = np.ascontiguousarray(img[:, :, ::-1]) # BGR to RGB if self.transforms is not None: img = self.transforms(img) return img, labels, img_ori, (h, w) def __len__(self): return self.nF # number of batches class LoadImagesAndObsTAO: def __init__(self, root, video_meta, obs, opt): self.dataroot = root self.img_ind = [x['id'] for x in video_meta] self.img_files = [x['file_name'] for x in video_meta] self.img_files = [osp.join(root, 'frames', x) for x in self.img_files] self.obs = [obs.get(x, []) for x in self.img_ind] self.use_lab = getattr(opt, 'use_lab', False) self.transforms = T.Compose([T.ToTensor(), T.Normalize(opt.im_mean, opt.im_std)]) def __getitem__(self, index): img_ori = cv2.imread(self.img_files[index]) if img_ori is None: raise ValueError('File corrupt {}'.format(img_path)) h, w, _ = img_ori.shape img = img_ori if self.use_lab: img = cv2.cvtColor(img, cv2.COLOR_BGR2LAB) img = np.array([img[:,:,0],]*3) img = img.transpose(1,2,0) img = img / 255. img = np.ascontiguousarray(img[ :, :, ::-1]) # BGR to RGB if self.transforms is not None: img = self.transforms(img) obs = self.obs[index] if len(obs) == 0: labels = np.array([[0,0,1,1,-1,-1]]) else: boxes = np.array([x.get('bbox', [0,0,1,1]) for x in obs]) scores = np.array([x.get('score', 0) for x in obs])[:, None] cat_ids = np.array([x.get('category_id',-1) for x in obs])[:, None] labels = np.concatenate([boxes, scores, cat_ids], axis=1) if len(labels) > 0: # From tlwh to xywh: (x,y) is the box center labels[:, 0] = labels[:, 0] + labels[:, 2] / 2 labels[:, 1] = labels[:, 1] + labels[:, 3] / 2 labels[:, 0] /= w labels[:, 1] /= h labels[:, 2] /= w labels[:, 3] /= h return img, labels, img_ori, (h,w) def __len__(self): return len(self.img_files) class LoadImagesAndMaskObsVIS: def __init__(self, path, info, obs, opt): self.dataroot = path self.nF = info['length'] self.img_files = [osp.join(path, p) for p in info['file_names']] self.obsbyobj = obs self.transforms = T.Compose([T.ToTensor(), T.Normalize(opt.im_mean, 
opt.im_std)]) self.use_lab = getattr(opt, 'use_lab', False) def __getitem__(self, idx): img_ori = cv2.imread(self.img_files[idx]) if img_ori is None: raise ValueError('File corrupt {}'.format(img_path)) h, w, _ = img_ori.shape img = img_ori if self.use_lab: img = cv2.cvtColor(img, cv2.COLOR_BGR2LAB) img = np.array([img[:,:,0],]*3) img = img.transpose(1,2,0) img = img / 255. img = np.ascontiguousarray(img[ :, :, ::-1]) # BGR to RGB if self.transforms is not None: img = self.transforms(img) labels = list() for obj in self.obsbyobj: RLE = obj['segmentations'][idx] if RLE: labels.append(mask_utils.decode(RLE)) else: labels.append(np.zeros((h, w), dtype=np.uint8)) labels = np.stack(labels) return img, labels, img_ori, (h, w) def __len__(self): return self.nF class LoadImagesAndMaskObsMOTS(LoadImagesAndObs): def __init__(self, path, opt): super(LoadImagesAndMaskObsMOTS, self).__init__(path, opt) def get_data(self, img_path, label_path): img_ori = cv2.imread(img_path) # BGR if img_ori is None: raise ValueError('File corrupt {}'.format(img_path)) h, w, _ = img_ori.shape img = img_ori if self.use_lab: img = cv2.cvtColor(img, cv2.COLOR_BGR2LAB) img = np.array([img[:,:,0],]*3) img = img.transpose(1,2,0) img = img / 255. 
class LoadImagesAndPoseObs(LoadImagesAndObs):
    """Dataset yielding video frames with per-person pose annotations.

    The annotation file is a JSON document whose top-level ``'annolist'``
    entry holds one record per frame. Each record provides the image name
    under ``['image'][0]['name']`` and the annotated people under
    ``['annorect']``.
    """

    def __init__(self, obs_jpath, opt):
        """Index the frames listed in the annotation file.

        Args:
            obs_jpath: Path to the JSON annotation file.
            opt: Options object; ``data_root``, ``im_mean`` and ``im_std``
                are read, plus ``use_lab`` when present (default False).
        """
        # Context manager so the file handle is closed right after parsing.
        with open(obs_jpath, 'r') as fjson:
            self.infoj = json.load(fjson)['annolist']
        self.dataroot = opt.data_root
        self.nF = len(self.infoj)
        self.img_files = [osp.join(opt.data_root, p['image'][0]['name'])
                          for p in self.infoj]
        self.transforms = T.Compose([T.ToTensor(),
                                     T.Normalize(opt.im_mean, opt.im_std)])
        self.use_lab = getattr(opt, 'use_lab', False)

    def __getitem__(self, idx):
        """Load frame ``idx`` and the keypoint lists annotated on it.

        Returns:
            Tuple ``(img, labels, img_ori, (h, w))`` where ``img`` is the
            transformed frame, ``labels`` is a list with one
            ``annopoints[0]['point']`` entry per annotated person,
            ``img_ori`` is the array returned by ``cv2.imread`` and
            ``(h, w)`` is its height and width.

        Raises:
            ValueError: If the image file cannot be read.
        """
        img_path = self.img_files[idx]
        img_ori = cv2.imread(img_path)  # BGR
        if img_ori is None:
            # Report the offending file; this message previously referenced
            # an undefined name and therefore raised NameError instead.
            raise ValueError('File corrupt {}'.format(img_path))
        h, w, _ = img_ori.shape

        img = img_ori
        if self.use_lab:
            # Keep only the first LAB channel, replicated to three channels.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
            img = np.array([img[:, :, 0], ] * 3)
            img = img.transpose(1, 2, 0)
        img = img / 255.
        img = np.ascontiguousarray(img[:, :, ::-1])  # BGR to RGB
        if self.transforms is not None:
            img = self.transforms(img)

        info_label = self.infoj[idx]['annorect']
        labels = [l['annopoints'][0]['point'] for l in info_label]
        return img, labels, img_ori, (h, w)
def texturize(onehot):
    """Re-label object pixels by horizontal stripe.

    Rows that contain at least one pixel whose non-background channel
    (index >= 1) equals exactly 1 are split, top to bottom, into at most
    ten groups of consecutive rows ("stripes").

    Args:
        onehot: Array of shape ``(H, W, C)`` with channel 0 as background.

    Returns:
        Float array of shape ``(H, W, nstripes + 1)``. Channel 0 is 1 on
        every pixel that is not an object pixel; an object pixel in stripe
        ``k`` has channel ``k + 1`` set to 1 instead. When no object pixel
        exists the result has a single all-ones channel.
    """
    # Boolean (H, W) map of pixels where any non-background channel is 1.
    object_pixels = np.any(onehot[:, :, 1:] == 1, axis=-1)
    # Rows containing at least one object pixel, in top-to-bottom order.
    hidxs = np.flatnonzero(object_pixels.any(axis=1))

    nstripes = min(10, len(hidxs))
    out = np.zeros((*onehot.shape[:2], nstripes + 1))
    out[:, :, 0] = 1

    for i, h in enumerate(hidxs):
        # Map the i-th object row to its stripe index in [0, nstripes).
        cidx = int(i // (len(hidxs) / nstripes))
        w = object_pixels[h]
        row = out[h]  # view into `out`, so the writes below land in place
        row[w] = 0
        row[w, cidx + 1] = 1

    return out
def __getitem__(self, index):
    # Load one whole sequence: every frame, its label image, and the
    # (cached) one-hot / down-scaled label maps.
    #
    # Returns a tuple (imgs, imgs_orig, lbls_resize, lbls_tensor, lblset, meta):
    #   imgs        - torch.stack of the normalised frames
    #   imgs_orig   - torch.stack of the resized frames before normalisation
    #   lbls_resize - np.stack of the down-scaled one-hot label maps
    #   lbls_tensor - tensor built from the stacked label images
    #   lblset      - the label values used to build the one-hot maps
    #   meta        - dict with folder_path, img_paths and lbl_paths
    #
    # Side effect: missing "<label>_onehot.npy" and
    # "<label>_size<h>x<w>.npy" cache files are written with np.save.
    folder_path = self.jpgfiles[index]
    label_path = self.lblfiles[index]
    imgs = []
    imgs_orig = []
    lbls = []
    # The next three lists are never read in this method.
    lbls_onehot = []
    patches = []
    target_imgs = []
    # make_paths repeats the first frame self.videoLen times, hence the
    # extra self.videoLen entries counted here.
    frame_num = len(os.listdir(folder_path)) + self.videoLen
    img_paths, lbl_paths = self.make_paths(folder_path, label_path)
    t000 = time.time()  # timing value, never read
    for i in range(frame_num):
        t00 = time.time()  # timing value, never read
        img_path, lbl_path = img_paths[i], lbl_paths[i]
        img = load_image(img_path) # CxHxW
        lblimg = cv2.imread(lbl_path)
        ''' Resize img to 320x320 '''
        ht, wd = img.size(1), img.size(2)
        if self.imgSize > 0:
            newh, neww = ht, wd
            # ratio is fixed to 1.0 in both branches, so the frame is
            # resized to a square of side self.imgSize either way.
            if ht <= wd:
                ratio = 1.0 #float(wd) / float(ht)
                # width, height
                img = resize(img, int(self.imgSize * ratio), self.imgSize)
                newh = self.imgSize
                neww = int(self.imgSize * ratio)
            else:
                ratio = 1.0 #float(ht) / float(wd)
                # width, height
                img = resize(img, self.imgSize, int(self.imgSize * ratio))
                newh = int(self.imgSize * ratio)
                neww = self.imgSize
            # NOTE(review): cv2.resize's third positional parameter is
            # `dst`, not `interpolation`, so INTER_NEAREST is presumably not
            # applied here - confirm. Changing it would also alter the cached
            # .npy label files. dsize is (width, height); (newh, neww) only
            # works because both are equal.
            lblimg = cv2.resize(lblimg, (newh, neww), cv2.INTER_NEAREST)
        # Resized, but not augmented image
        img_orig = img.clone()
        ''' Transforms '''
        if self.use_lab:
            # Back to a uint8 image with reversed channel order, convert to
            # LAB, normalise, then replicate the first channel three times.
            img = im_to_numpy(img)
            img = (img * 255).astype(np.uint8)[:,:,::-1]
            img = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
            img = im_to_torch(img) / 255.
            img = color_normalize(img, self.im_mean, self.im_std)
            img = torch.stack([img[0]]*3)
        else:
            img = color_normalize(img, self.im_mean, self.im_std)
        imgs_orig.append(img_orig)
        imgs.append(img)
        lbls.append(lblimg.copy())
    # Meta info
    meta = dict(folder_path=folder_path, img_paths=img_paths, lbl_paths=lbl_paths)
    ########################################################
    # Load reshaped label information (load cached versions if possible)
    lbls = np.stack(lbls)
    # This value is overwritten inside the loop below before any use.
    prefix = '/' + '/'.join(lbl_paths[0].split('.')[:-1])
    # Get lblset
    lblset = make_lbl_set(lbls)
    # When consecutive label rows differ by exactly 1 in every channel, only
    # the first channel is kept (presumably index-style labels - confirm).
    if np.all((lblset[1:] - lblset[:-1]) == 1):
        lblset = lblset[:, 0:1]
    onehots = []
    resizes = []
    # Size of the down-scaled label map, derived from the last loaded frame.
    rsz_h, rsz_w = math.ceil(img.size(1) / self.mapScale[0]), math.ceil(img.size(2) /self.mapScale[1])
    for i,p in enumerate(lbl_paths):
        # Cache prefix: the label path without its last '.'-separated part,
        # with a '/' prepended (assumes the only '.' is the extension - verify).
        prefix = '/' + '/'.join(p.split('.')[:-1])
        # print(prefix)
        oh_path = "%s_%s.npy" % (prefix, 'onehot')
        rz_path = "%s_%s.npy" % (prefix, 'size%sx%s' % (rsz_h, rsz_w))
        onehot = try_np_load(oh_path)
        if onehot is None:
            print('computing onehot lbl for', oh_path)
            # One boolean channel per label value in lblset.
            onehot = np.stack([np.all(lbls[i] == ll, axis=-1) for ll in lblset], axis=-1)
            np.save(oh_path, onehot)
        resized = try_np_load(rz_path)
        if resized is None:
            print('computing resized lbl for', rz_path)
            # NOTE(review): same positional-flag pattern as the label resize
            # above; the third positional parameter is `dst` - confirm.
            resized = cv2.resize(np.float32(onehot), (rsz_w, rsz_h), cv2.INTER_LINEAR)
            np.save(rz_path, resized)
        if self.texture:
            # Texture mode only uses the first label map, then stops.
            texturized = texturize(resized)
            resizes.append(texturized)
            lblset = np.array([[0, 0, 0]] + [cm.Paired(i)[:3] for i in range(texturized.shape[-1])]) * 255.0
            break
        else:
            resizes.append(resized)
            onehots.append(onehot)
    if self.texture:
        # Repeat the single texturized map videoLen times and pad with
        # zero maps so there is one entry per label path.
        resizes = resizes * self.videoLen
        for _ in range(len(lbl_paths)-self.videoLen):
            resizes.append(np.zeros(resizes[0].shape))
        onehots = resizes
    ########################################################
    imgs = torch.stack(imgs)
    imgs_orig = torch.stack(imgs_orig)
    lbls_tensor = torch.from_numpy(np.stack(lbls))
    lbls_resize = np.stack(resizes)
    assert lbls_resize.shape[0] == len(meta['lbl_paths'])
    return imgs, imgs_orig, lbls_resize, lbls_tensor, lblset, meta
def dets2obs(dets, imginfo, cls):
    """Turn detector output into normalised tracker observations.

    Args:
        dets: Tensor with one detection per row, or None. Columns 0-3 are
            read as box corners (x1, y1, x2, y2), columns 4 and 5 are
            multiplied into a single score, column 6 is the class id.
        imginfo: Mapping providing the image ``'height'`` and ``'width'``.
        cls: Container of class ids to keep.

    Returns:
        Array with one row ``[cx, cy, w, h, score, class]`` per kept
        detection, with the box expressed as a fraction of the image size.
        An empty 1-D array is returned when there is nothing to keep.
    """
    if dets is None or len(dets) == 0:
        return np.array([])

    raw = dets.cpu().numpy()
    img_h, img_w = imginfo['height'], imginfo['width']
    x1, y1, x2, y2 = raw[:, 0], raw[:, 1], raw[:, 2], raw[:, 3]

    # Corner boxes -> normalised centre / size.
    out = np.zeros((len(raw), 6))
    out[:, 0] = (x1 + x2) * 0.5 / img_w
    out[:, 1] = (y1 + y2) * 0.5 / img_h
    out[:, 2] = (x2 - x1) / img_w
    out[:, 3] = (y2 - y1) / img_h
    out[:, 4] = raw[:, 4] * raw[:, 5]
    out[:, 5] = raw[:, 6]

    # Keep only the requested classes.
    keep = np.array([int(row[5]) in cls for row in out], dtype=bool)
    if not keep.any():
        return np.array([])
    return out[keep]
def main(exp, args):
    """Run detection + tracking on one video and encode the result frames.

    Args:
        exp: YOLOX experiment object; supplies the detector model through
            ``get_model()`` and the test size through ``test_size``.
        args: Parsed options. ``path``, ``tsize``, ``output_root`` and
            ``ckpt`` are read here; ``frame_rate`` is set from the video
            loader before the tracker is built.

    Annotated frames are written to ``<output_root>/<video>/frame`` and then
    assembled by ``ffmpeg`` into ``<output_root>/<video>/<video>.avi``.
    """
    # Local import: only needed for the ffmpeg export at the end.
    import subprocess

    logger.info("Args: {}".format(args))

    # Data, I/O
    dataloader = LoadVideo(args.path, args.tsize)
    video_name = osp.basename(args.path).split('.')[0]
    result_root = osp.join(args.output_root, video_name)
    result_filename = os.path.join(result_root, 'results.txt')
    args.frame_rate = dataloader.frame_rate

    # Detector init
    det_model = exp.get_model()
    logger.info("Model Summary: {}".format(
        get_model_info(det_model, exp.test_size)))
    det_model.cuda()
    det_model.eval()
    logger.info("loading checkpoint")
    ckpt = torch.load(args.ckpt, map_location="cpu")
    # load the model state dict
    det_model.load_state_dict(ckpt["model"])
    logger.info("loaded checkpoint done.")
    detector = Predictor(det_model, exp, COCO_CLASSES, None, None, 'gpu')

    # Tracker init
    tracker = BoxAssociationTracker(args)

    frame_dir = osp.join(result_root, 'frame')
    try:
        eval_seq(args, dataloader, detector, tracker, result_filename,
                 save_dir=frame_dir, show_image=False)
    except Exception:
        # Best effort: still encode whatever frames were written, but keep
        # the full traceback instead of printing only the message.
        logger.exception("Tracking stopped with an error")

    output_video_path = osp.join(result_root, video_name + '.avi')
    # Argument list instead of a shell string, so paths containing spaces or
    # shell metacharacters are passed to ffmpeg verbatim.
    ffmpeg_cmd = ['ffmpeg', '-f', 'image2',
                  '-i', '{}/%05d.jpg'.format(frame_dir),
                  '-c:v', 'copy', output_video_path]
    try:
        subprocess.run(ffmpeg_cmd, check=False)
    except OSError as err:
        # e.g. ffmpeg is not installed; the saved frames remain available.
        logger.error("Could not run ffmpeg: {}".format(err))
def preproc(img, im_mean, im_std, use_lab=False):
    """Prepare one frame for the appearance model.

    Args:
        img: Frame handed to ``load_image``.
        im_mean: Per-channel mean passed to ``color_normalize``.
        im_std: Per-channel std passed to ``color_normalize``.
        use_lab: When true, convert to LAB and replicate the first
            normalised channel three times.

    Returns:
        The normalised frame as a numpy array, obtained by permuting the
        tensor axes with ``(1, 2, 0)``.
    """
    frame = load_image(img)

    if not use_lab:
        normed = color_normalize(frame, im_mean, im_std)
        return normed.permute(1, 2, 0).numpy()  # H, W, C

    # Back to a uint8 image with reversed channel order for OpenCV.
    as_uint8 = (im_to_numpy(frame) * 255).astype(np.uint8)[:, :, ::-1]
    lab = im_to_torch(cv2.cvtColor(as_uint8, cv2.COLOR_BGR2LAB)) / 255.
    lab = color_normalize(lab, im_mean, im_std)

    first = lab[0]
    replicated = torch.stack([first, first, first])
    return replicated.permute(1, 2, 0).numpy()  # H, W, C
def main():
    """Command-line entry point of the single-object tracking demo.

    Parses ``--config`` and ``--input``, copies the ``common`` and ``sot``
    sections of the YAML config onto the argument namespace, builds the
    appearance model and the SiamFC tracker around it, and starts tracking.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', default='', required=True, type=str)
    parser.add_argument('--input', required=True, type=str)
    args = parser.parse_args()

    with open(args.config) as f:
        # safe_load: yaml.load() without an explicit Loader is rejected by
        # PyYAML >= 6 and can construct arbitrary objects on older releases.
        common_args = yaml.safe_load(f)
    # Later sections override earlier ones: 'sot' wins over 'common'.
    for section in ('common', 'sot'):
        for k, v in common_args[section].items():
            setattr(args, k, v)
    args.arch = 'SiamFC'

    # prepare model
    base = AppearanceModel(args).to(args.device)
    print('Total params: %.2fM' % (sum(p.numel() for p in base.parameters())/1e6))
    print(base)

    net = models.__dict__[args.arch](base=base, config=TrackerConfig())
    net.eval()
    net = net.cuda()

    track(net, args)
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ docs/build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don’t work, or not # install all needed dependencies. #Pipfile.lock # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ ### Vim ### # Swap [._]*.s[a-v][a-z] [._]*.sw[a-p] [._]s[a-rt-v][a-z] [._]ss[a-gi-z] [._]sw[a-p] # Session Session.vim # Temporary .netrwhist # Auto-generated tag files tags # Persistent undo [._]*.un~ # output docs/api .code-workspace.code-workspace *.pkl *.npy *.pth *.onnx events.out.tfevents* # vscode *.code-workspace .vscode # vim .vim ================================================ FILE: detector/YOLOX/LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. 
"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "{}" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright 2021 Megvii, Base Detection Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: detector/YOLOX/README.md ================================================

## Introduction YOLOX is an anchor-free version of YOLO, with a simpler design but better performance! It aims to bridge the gap between research and industrial communities. For more details, please refer to our [report on Arxiv](https://arxiv.org/abs/2107.08430). ## Updates!! * 【2021/07/20】 We have released our technical report on [Arxiv](https://arxiv.org/abs/2107.08430). ## Coming soon - [ ] YOLOX-P6 and larger models. - [ ] Objects365 pretrain. - [ ] Transformer modules. - [ ] More features in need. ## Benchmark #### Standard Models. |Model |size |mAPtest
0.5:0.95 | Speed V100
(ms) | Params
(M) |FLOPs
(G)| weights | | ------ |:---: | :---: |:---: |:---: | :---: | :----: | |[YOLOX-s](./exps/default/yolox_s.py) |640 |39.6 |9.8 |9.0 | 26.8 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EW62gmO2vnNNs5npxjzunVwB9p307qqygaCkXdTO88BLUg?e=NMTQYw)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_s.pth) | |[YOLOX-m](./exps/default/yolox_m.py) |640 |46.4 |12.3 |25.3 |73.8| [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/ERMTP7VFqrVBrXKMU7Vl4TcBQs0SUeCT7kvc-JdIbej4tQ?e=1MDo9y)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_m.pth) | |[YOLOX-l](./exps/default/yolox_l.py) |640 |50.0 |14.5 |54.2| 155.6 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EWA8w_IEOzBKvuueBqfaZh0BeoG5sVzR-XYbOJO4YlOkRw?e=wHWOBE)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_l.pth) | |[YOLOX-x](./exps/default/yolox_x.py) |640 |**51.2** | 17.3 |99.1 |281.9 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EdgVPHBziOVBtGAXHfeHI5kBza0q9yyueMGdT0wXZfI1rQ?e=tABO5u)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_x.pth) | |[YOLOX-Darknet53](./exps/default/yolov3.py) |640 | 47.4 | 11.1 |63.7 | 185.3 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EZ-MV1r_fMFPkPrNjvbJEMoBLOLAnXH-XKEB77w8LhXL6Q?e=mf6wOc)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_darknet53.pth) | #### Light Models. |Model |size |mAPval
0.5:0.95 | Params
(M) |FLOPs
(G)| weights | | ------ |:---: | :---: |:---: |:---: | :---: | |[YOLOX-Nano](./exps/default/nano.py) |416 |25.3 | 0.91 |1.08 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EdcREey-krhLtdtSnxolxiUBjWMy6EFdiaO9bdOwZ5ygCQ?e=yQpdds)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_nano.pth) | |[YOLOX-Tiny](./exps/default/yolox_tiny.py) |416 |31.7 | 5.06 |6.45 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EYtjNFPqvZBBrQ-VowLcSr4B6Z5TdTflUsr_gO2CwhC3bQ?e=SBTwXj)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_tiny.pth) | ## Quick Start
Installation Step1. Install YOLOX. ```shell git clone git@github.com:Megvii-BaseDetection/YOLOX.git cd YOLOX pip3 install -U pip && pip3 install -r requirements.txt pip3 install -v -e . # or python3 setup.py develop ``` Step2. Install [apex](https://github.com/NVIDIA/apex). ```shell # skip this step if you don't want to train model. git clone https://github.com/NVIDIA/apex cd apex pip3 install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ ``` Step3. Install [pycocotools](https://github.com/cocodataset/cocoapi). ```shell pip3 install cython; pip3 install 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI' ```
Demo Step1. Download a pretrained model from the benchmark table. Step2. Use either -n or -f to specify your detector's config. For example: ```shell python tools/demo.py image -n yolox-s -c /path/to/your/yolox_s.pth.tar --path assets/dog.jpg --conf 0.3 --nms 0.65 --tsize 640 --save_result --device [cpu/gpu] ``` or ```shell python tools/demo.py image -f exps/default/yolox_s.py -c /path/to/your/yolox_s.pth.tar --path assets/dog.jpg --conf 0.3 --nms 0.65 --tsize 640 --save_result --device [cpu/gpu] ``` Demo for video: ```shell python tools/demo.py video -n yolox-s -c /path/to/your/yolox_s.pth.tar --path /path/to/your/video --conf 0.3 --nms 0.65 --tsize 640 --save_result --device [cpu/gpu] ```
Reproduce our results on COCO Step1. Prepare COCO dataset ```shell cd <YOLOX_HOME> ln -s /path/to/your/COCO ./datasets/COCO ``` Step2. Reproduce our results on COCO by specifying -n: ```shell python tools/train.py -n yolox-s -d 8 -b 64 --fp16 -o yolox-m yolox-l yolox-x ``` * -d: number of GPU devices * -b: total batch size, the recommended number for -b is num-gpu * 8 * --fp16: mixed precision training When using -f, the above commands are equivalent to: ```shell python tools/train.py -f exps/default/yolox_s.py -d 8 -b 64 --fp16 -o exps/default/yolox_m.py exps/default/yolox_l.py exps/default/yolox_x.py ```
Evaluation We support batch testing for fast evaluation: ```shell python tools/eval.py -n yolox-s -c yolox_s.pth.tar -b 64 -d 8 --conf 0.001 [--fp16] [--fuse] yolox-m yolox-l yolox-x ``` * --fuse: fuse conv and bn * -d: number of GPUs used for evaluation. DEFAULT: All GPUs available will be used. * -b: total batch size across all GPUs To reproduce the speed test, we use the following command: ```shell python tools/eval.py -n yolox-s -c yolox_s.pth.tar -b 1 -d 1 --conf 0.001 --fp16 --fuse yolox-m yolox-l yolox-x ```
Tutorials * [Training on custom data](docs/train_custom_data.md).
## Deployment 1. [ONNX export and an ONNXRuntime](./demo/ONNXRuntime) 2. [TensorRT in C++ and Python](./demo/TensorRT) 3. [ncnn in C++ and Java](./demo/ncnn) 4. [OpenVINO in C++ and Python](./demo/OpenVINO) ## Third-party resources * The ncnn android app with video support: [ncnn-android-yolox](https://github.com/FeiGeChuanShu/ncnn-android-yolox) from [FeiGeChuanShu](https://github.com/FeiGeChuanShu) * YOLOX with Tengine support: [Tengine](https://github.com/OAID/Tengine/blob/tengine-lite/examples/tm_yolox.cpp) from [BUG1989](https://github.com/BUG1989) * YOLOX + ROS2 Foxy: [YOLOX-ROS](https://github.com/Ar-Ray-code/YOLOX-ROS) from [Ar-Ray](https://github.com/Ar-Ray-code) * YOLOX Deploy DeepStream: [YOLOX-deepstream](https://github.com/nanmi/YOLOX-deepstream) from [nanmi](https://github.com/nanmi) * YOLOX ONNXRuntime C++ Demo: [lite.ai](https://github.com/DefTruth/lite.ai/blob/main/ort/cv/yolox.cpp) from [DefTruth](https://github.com/DefTruth) ## Cite YOLOX If you use YOLOX in your research, please cite our work by using the following BibTeX entry: ```latex @article{yolox2021, title={YOLOX: Exceeding YOLO Series in 2021}, author={Ge, Zheng and Liu, Songtao and Wang, Feng and Li, Zeming and Sun, Jian}, journal={arXiv preprint arXiv:2107.08430}, year={2021} } ``` ================================================ FILE: detector/YOLOX/datasets/README.md ================================================ # Prepare datasets If you have a dataset directory, you could use os environment variable named `YOLOX_DATADIR`. Under this directory, YOLOX will look for datasets in the structure described below, if needed. ``` $YOLOX_DATADIR/ COCO/ ``` You can set the location for builtin datasets by ```shell export YOLOX_DATADIR=/path/to/your/datasets ``` If `YOLOX_DATADIR` is not set, the default value of dataset directory is `./datasets` relative to your current working directory. 
## Expected dataset structure for [COCO detection](https://cocodataset.org/#download): ``` COCO/ annotations/ instances_{train,val}2017.json {train,val}2017/ # image files that are mentioned in the corresponding json ``` You can use the 2014 version of the dataset as well. ================================================ FILE: detector/YOLOX/demo/ONNXRuntime/README.md ================================================ ## YOLOX-ONNXRuntime in Python This doc introduces how to convert your pytorch model into onnx, and how to run an onnxruntime demo to verify your convertion. ### Download ONNX models. | Model | Parameters | GFLOPs | Test Size | mAP | Weights | |:------| :----: | :----: | :---: | :---: | :---: | | YOLOX-Nano | 0.91M | 1.08 | 416x416 | 25.3 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EfAGwvevU-lNhW5OqFAyHbwBJdI_7EaKu5yU04fgF5BU7w?e=gvq4hf)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_nano.onnx) | | YOLOX-Tiny | 5.06M | 6.45 | 416x416 |31.7 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EVigCszU1ilDn-MwLwHCF1ABsgTy06xFdVgZ04Yyo4lHVA?e=hVKiCw)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_tiny.onnx) | | YOLOX-S | 9.0M | 26.8 | 640x640 |39.6 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/Ec0L1d1x2UtIpbfiahgxhtgBZVjb1NCXbotO8SCOdMqpQQ?e=siyIsK)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_s.onnx) | | YOLOX-M | 25.3M | 73.8 | 640x640 |46.4 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/ERUKlQe-nlxBoTKPy1ynbxsBmAZ_h-VBEV-nnfPdzUIkZQ?e=hyQQtl)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_m.onnx) | | YOLOX-L | 54.2M | 155.6 | 640x640 |50.0 | 
[onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/ET5w926jCA5GlVfg9ixB4KEBiW0HYl7SzaHNRaRG9dYO_A?e=ISmCYX)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_l.onnx) | | YOLOX-Darknet53| 63.72M | 185.3 | 640x640 |47.3 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/ESArloSW-MlPlLuemLh9zKkBdovgweKbfu4zkvzKAp7pPQ?e=f81Ikw)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_darknet53.onnx) | | YOLOX-X | 99.1M | 281.9 | 640x640 |51.2 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/ERjqoeMJlFdGuM3tQfXQmhABmGHlIHydWCwhlugeWLE9AA)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox.onnx) | ### Convert Your Model to ONNX First, you should move to <YOLOX_HOME> by: ```shell cd <YOLOX_HOME> ``` Then, you can: 1. Convert a standard YOLOX model by -n: ```shell python3 tools/export_onnx.py --output-name yolox_s.onnx -n yolox-s -c yolox_s.pth.tar ``` Notes: * -n: specify a model name. The model name must be one of the [yolox-s,m,l,x and yolox-nano, yolox-tiny, yolov3] * -c: the model you have trained * -o: opset version, default 11. **However, if you will further convert your onnx model to [OpenVINO](../OpenVINO/), please specify the opset version to 10.** * --no-onnxsim: disable onnxsim * To customize an input shape for onnx model, modify the following code in tools/export_onnx.py: ```python dummy_input = torch.randn(1, 3, exp.test_size[0], exp.test_size[1]) ``` 2. Convert a standard YOLOX model by -f. When using -f, the above command is equivalent to: ```shell python3 tools/export_onnx.py --output-name yolox_s.onnx -f exps/default/yolox_s.py -c yolox_s.pth.tar ``` 3. To convert your customized model, please use -f: ```shell python3 tools/export_onnx.py --output-name your_yolox.onnx -f exps/your_dir/your_yolox.py -c your_yolox.pth.tar ``` ### ONNXRuntime Demo Step1. ```shell cd <YOLOX_HOME>/demo/ONNXRuntime ``` Step2. 
```shell python3 onnx_inference.py -m -i -o -s 0.3 --input_shape 640,640 ``` Notes: * -m: your converted onnx model * -i: input_image * -s: score threshold for visualization. * --input_shape: should be consistent with the shape you used for onnx convertion. ================================================ FILE: detector/YOLOX/demo/ONNXRuntime/onnx_inference.py ================================================ #!/usr/bin/env python3 # -*- coding: utf-8 -*- # Copyright (c) Megvii, Inc. and its affiliates. import argparse import os import cv2 import numpy as np import onnxruntime from yolox.data.data_augment import preproc as preprocess from yolox.data.datasets import COCO_CLASSES from yolox.utils import mkdir, multiclass_nms, demo_postprocess, vis def make_parser(): parser = argparse.ArgumentParser("onnxruntime inference sample") parser.add_argument( "-m", "--model", type=str, default="yolox.onnx", help="Input your onnx model.", ) parser.add_argument( "-i", "--image_path", type=str, default='test_image.png', help="Path to your input image.", ) parser.add_argument( "-o", "--output_dir", type=str, default='demo_output', help="Path to your output directory.", ) parser.add_argument( "-s", "--score_thr", type=float, default=0.3, help="Score threshould to filter the result.", ) parser.add_argument( "--input_shape", type=str, default="640,640", help="Specify an input shape for inference.", ) parser.add_argument( "--with_p6", action="store_true", help="Whether your model uses p6 in FPN/PAN.", ) return parser if __name__ == '__main__': args = make_parser().parse_args() input_shape = tuple(map(int, args.input_shape.split(','))) origin_img = cv2.imread(args.image_path) mean = (0.485, 0.456, 0.406) std = (0.229, 0.224, 0.225) img, ratio = preprocess(origin_img, input_shape, mean, std) session = onnxruntime.InferenceSession(args.model) ort_inputs = {session.get_inputs()[0].name: img[None, :, :, :]} output = session.run(None, ort_inputs) predictions = demo_postprocess(output[0], 
input_shape, p6=args.with_p6)[0] boxes = predictions[:, :4] scores = predictions[:, 4:5] * predictions[:, 5:] boxes_xyxy = np.ones_like(boxes) boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2]/2. boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3]/2. boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2]/2. boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3]/2. boxes_xyxy /= ratio dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.65, score_thr=0.1) if dets is not None: final_boxes, final_scores, final_cls_inds = dets[:, :4], dets[:, 4], dets[:, 5] origin_img = vis(origin_img, final_boxes, final_scores, final_cls_inds, conf=args.score_thr, class_names=COCO_CLASSES) mkdir(args.output_dir) output_path = os.path.join(args.output_dir, args.image_path.split("/")[-1]) cv2.imwrite(output_path, origin_img) ================================================ FILE: detector/YOLOX/demo/OpenVINO/README.md ================================================ ## YOLOX for OpenVINO * [C++ Demo](./cpp) * [Python Demo](./python) ================================================ FILE: detector/YOLOX/demo/OpenVINO/cpp/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.4.1) set(CMAKE_CXX_STANDARD 14) project(yolox_openvino_demo) find_package(OpenCV REQUIRED) find_package(InferenceEngine REQUIRED) find_package(ngraph REQUIRED) include_directories( ${OpenCV_INCLUDE_DIRS} ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR} ) add_executable(yolox_openvino yolox_openvino.cpp) target_link_libraries( yolox_openvino ${InferenceEngine_LIBRARIES} ${NGRAPH_LIBRARIES} ${OpenCV_LIBS} ) ================================================ FILE: detector/YOLOX/demo/OpenVINO/cpp/README.md ================================================ # YOLOX-OpenVINO in C++ This toturial includes a C++ demo for OpenVINO, as well as some converted models. ### Download OpenVINO models. 
| Model | Parameters | GFLOPs | Test Size | mAP | Weights | |:------| :----: | :----: | :---: | :---: | :---: | | [YOLOX-Nano](../../../exps/nano.py) | 0.91M | 1.08 | 416x416 | 25.3 | [Download](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EeWY57o5wQZFtXYd1KJw6Z8B4vxZru649XxQHYIFgio3Qw?e=ZS81ce)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_nano_openvino.tar.gz) | | [YOLOX-Tiny](../../../exps/yolox_tiny.py) | 5.06M | 6.45 | 416x416 |31.7 | [Download](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/ETfvOoCXdVZNinoSpKA_sEYBIQVqfjjF5_M6VvHRnLVcsA?e=STL1pi)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_tiny_openvino.tar.gz) | | [YOLOX-S](../../../exps/yolox_s.py) | 9.0M | 26.8 | 640x640 |39.6 | [Download](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EXUjf3PQnbBLrxNrXPueqaIBzVZOrYQOnJpLK1Fytj5ssA?e=GK0LOM)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_s_openvino.tar.gz) | | [YOLOX-M](../../../exps/yolox_m.py) | 25.3M | 73.8 | 640x640 |46.4 | [Download](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EcoT1BPpeRpLvE_4c441zn8BVNCQ2naxDH3rho7WqdlgLQ?e=95VaM9)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_m_openvino.tar.gz) | | [YOLOX-L](../../../exps/yolox_l.py) | 54.2M | 155.6 | 640x640 |50.0 | [Download](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EZvmn-YLRuVPh0GAP_w3xHMB2VGvrKqQXyK_Cv5yi_DXUg?e=YRh6Eq)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_l_openvino.tar.gz) | | [YOLOX-Darknet53](../../../exps/yolov3.py) | 63.72M | 185.3 | 640x640 |47.3 | 
[Download](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EQP8LSroikFHuwX0jFRetmcBOCDWSFmylHxolV7ezUPXGw?e=bEw5iq)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_darknet53_openvino.tar.gz) | | [YOLOX-X](../../../exps/yolox_x.py) | 99.1M | 281.9 | 640x640 |51.2 | [Download](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EZFPnLqiD-xIlt7rcZYDjQgB4YXE9wnq1qaSXQwJrsKbdg?e=83nwEz)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_x_openvino.tar.gz) | ## Install OpenVINO Toolkit Please visit [Openvino Homepage](https://docs.openvinotoolkit.org/latest/get_started_guides.html) for more details. ## Set up the Environment ### For Linux **Option1. Set up the environment temporarily. You need to run this command every time you start a new shell window.** ```shell source /opt/intel/openvino_2021/bin/setupvars.sh ``` **Option2. Set up the environment permanently.** *Step1.* For Linux: ```shell vim ~/.bashrc ``` *Step2.* Add the following line into your file: ```shell source /opt/intel/openvino_2021/bin/setupvars.sh ``` *Step3.* Save and exit the file, then run: ```shell source ~/.bashrc ``` ## Convert model 1. Export ONNX model Please refer to the [ONNX tutorial](../../ONNXRuntime). **Note that you should set --opset to 10, otherwise your next step will fail.** 2. Convert ONNX to OpenVINO ```shell cd <INSTALL_DIR>/openvino_2021/deployment_tools/model_optimizer ``` Install requirements for convert tool ```shell sudo ./install_prerequisites/install_prerequisites_onnx.sh ``` Then convert model. ```shell python3 mo.py --input_model <ONNX_PATH> --input_shape <INPUT_SHAPE> [--data_type FP16] ``` For example: ```shell python3 mo.py --input_model yolox.onnx --input_shape (1,3,640,640) --data_type FP16 ``` ## Build ### Linux ```shell source /opt/intel/openvino_2021/bin/setupvars.sh mkdir build cd build cmake .. 
make ``` ## Demo ### c++ ```shell ./yolox_openvino ``` ================================================ FILE: detector/YOLOX/demo/OpenVINO/cpp/yolox_openvino.cpp ================================================ // Copyright (C) 2018-2021 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include #include #include #include #include #include #include using namespace InferenceEngine; /** * @brief Define names based depends on Unicode path support */ #define tcout std::cout #define file_name_t std::string #define imread_t cv::imread #define NMS_THRESH 0.65 #define BBOX_CONF_THRESH 0.3 static const int INPUT_W = 416; static const int INPUT_H = 416; cv::Mat static_resize(cv::Mat& img) { float r = std::min(INPUT_W / (img.cols*1.0), INPUT_H / (img.rows*1.0)); // r = std::min(r, 1.0f); int unpad_w = r * img.cols; int unpad_h = r * img.rows; cv::Mat re(unpad_h, unpad_w, CV_8UC3); cv::resize(img, re, re.size()); cv::Mat out(INPUT_W, INPUT_H, CV_8UC3, cv::Scalar(114, 114, 114)); re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows))); return out; } void blobFromImage(cv::Mat& img, Blob::Ptr& blob){ cv::cvtColor(img, img, cv::COLOR_BGR2RGB); int channels = 3; int img_h = img.rows; int img_w = img.cols; std::vector mean = {0.485, 0.456, 0.406}; std::vector std = {0.229, 0.224, 0.225}; InferenceEngine::MemoryBlob::Ptr mblob = InferenceEngine::as(blob); if (!mblob) { THROW_IE_EXCEPTION << "We expect blob to be inherited from MemoryBlob in matU8ToBlob, " << "but by fact we were not able to cast inputBlob to MemoryBlob"; } // locked memory holder should be alive all time while access to its buffer happens auto mblobHolder = mblob->wmap(); float *blob_data = mblobHolder.as(); for (size_t c = 0; c < channels; c++) { for (size_t h = 0; h < img_h; h++) { for (size_t w = 0; w < img_w; w++) { blob_data[c * img_w * img_h + h * img_w + w] = (((float)img.at(h, w)[c]) / 255.0f - mean[c]) / std[c]; } } } } struct Object { cv::Rect_ rect; int label; float prob; }; struct GridAndStride { int 
grid0; int grid1; int stride; }; static void generate_grids_and_stride(const int target_size, std::vector& strides, std::vector& grid_strides) { for (auto stride : strides) { int num_grid = target_size / stride; for (int g1 = 0; g1 < num_grid; g1++) { for (int g0 = 0; g0 < num_grid; g0++) { grid_strides.push_back((GridAndStride){g0, g1, stride}); } } } } static void generate_yolox_proposals(std::vector grid_strides, const float* feat_ptr, float prob_threshold, std::vector& objects) { const int num_class = 80; // COCO has 80 classes. Modify this value on your own dataset. const int num_anchors = grid_strides.size(); for (int anchor_idx = 0; anchor_idx < num_anchors; anchor_idx++) { const int grid0 = grid_strides[anchor_idx].grid0; const int grid1 = grid_strides[anchor_idx].grid1; const int stride = grid_strides[anchor_idx].stride; const int basic_pos = anchor_idx * 85; // yolox/models/yolo_head.py decode logic // outputs[..., :2] = (outputs[..., :2] + grids) * strides // outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides float x_center = (feat_ptr[basic_pos + 0] + grid0) * stride; float y_center = (feat_ptr[basic_pos + 1] + grid1) * stride; float w = exp(feat_ptr[basic_pos + 2]) * stride; float h = exp(feat_ptr[basic_pos + 3]) * stride; float x0 = x_center - w * 0.5f; float y0 = y_center - h * 0.5f; float box_objectness = feat_ptr[basic_pos + 4]; for (int class_idx = 0; class_idx < num_class; class_idx++) { float box_cls_score = feat_ptr[basic_pos + 5 + class_idx]; float box_prob = box_objectness * box_cls_score; if (box_prob > prob_threshold) { Object obj; obj.rect.x = x0; obj.rect.y = y0; obj.rect.width = w; obj.rect.height = h; obj.label = class_idx; obj.prob = box_prob; objects.push_back(obj); } } // class loop } // point anchor loop } static inline float intersection_area(const Object& a, const Object& b) { cv::Rect_ inter = a.rect & b.rect; return inter.area(); } static void qsort_descent_inplace(std::vector& faceobjects, int left, int right) { int i = 
left; int j = right; float p = faceobjects[(left + right) / 2].prob; while (i <= j) { while (faceobjects[i].prob > p) i++; while (faceobjects[j].prob < p) j--; if (i <= j) { // swap std::swap(faceobjects[i], faceobjects[j]); i++; j--; } } #pragma omp parallel sections { #pragma omp section { if (left < j) qsort_descent_inplace(faceobjects, left, j); } #pragma omp section { if (i < right) qsort_descent_inplace(faceobjects, i, right); } } } static void qsort_descent_inplace(std::vector& objects) { if (objects.empty()) return; qsort_descent_inplace(objects, 0, objects.size() - 1); } static void nms_sorted_bboxes(const std::vector& faceobjects, std::vector& picked, float nms_threshold) { picked.clear(); const int n = faceobjects.size(); std::vector areas(n); for (int i = 0; i < n; i++) { areas[i] = faceobjects[i].rect.area(); } for (int i = 0; i < n; i++) { const Object& a = faceobjects[i]; int keep = 1; for (int j = 0; j < (int)picked.size(); j++) { const Object& b = faceobjects[picked[j]]; // intersection over union float inter_area = intersection_area(a, b); float union_area = areas[i] + areas[picked[j]] - inter_area; // float IoU = inter_area / union_area if (inter_area / union_area > nms_threshold) keep = 0; } if (keep) picked.push_back(i); } } static void decode_outputs(const float* prob, std::vector& objects, float scale, const int img_w, const int img_h) { std::vector proposals; std::vector strides = {8, 16, 32}; std::vector grid_strides; generate_grids_and_stride(INPUT_W, strides, grid_strides); generate_yolox_proposals(grid_strides, prob, BBOX_CONF_THRESH, proposals); qsort_descent_inplace(proposals); std::vector picked; nms_sorted_bboxes(proposals, picked, NMS_THRESH); int count = picked.size(); objects.resize(count); for (int i = 0; i < count; i++) { objects[i] = proposals[picked[i]]; // adjust offset to original unpadded float x0 = (objects[i].rect.x) / scale; float y0 = (objects[i].rect.y) / scale; float x1 = (objects[i].rect.x + objects[i].rect.width) / 
scale; float y1 = (objects[i].rect.y + objects[i].rect.height) / scale; // clip x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f); y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f); x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f); y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f); objects[i].rect.x = x0; objects[i].rect.y = y0; objects[i].rect.width = x1 - x0; objects[i].rect.height = y1 - y0; } } const float color_list[80][3] = { {0.000, 0.447, 0.741}, {0.850, 0.325, 0.098}, {0.929, 0.694, 0.125}, {0.494, 0.184, 0.556}, {0.466, 0.674, 0.188}, {0.301, 0.745, 0.933}, {0.635, 0.078, 0.184}, {0.300, 0.300, 0.300}, {0.600, 0.600, 0.600}, {1.000, 0.000, 0.000}, {1.000, 0.500, 0.000}, {0.749, 0.749, 0.000}, {0.000, 1.000, 0.000}, {0.000, 0.000, 1.000}, {0.667, 0.000, 1.000}, {0.333, 0.333, 0.000}, {0.333, 0.667, 0.000}, {0.333, 1.000, 0.000}, {0.667, 0.333, 0.000}, {0.667, 0.667, 0.000}, {0.667, 1.000, 0.000}, {1.000, 0.333, 0.000}, {1.000, 0.667, 0.000}, {1.000, 1.000, 0.000}, {0.000, 0.333, 0.500}, {0.000, 0.667, 0.500}, {0.000, 1.000, 0.500}, {0.333, 0.000, 0.500}, {0.333, 0.333, 0.500}, {0.333, 0.667, 0.500}, {0.333, 1.000, 0.500}, {0.667, 0.000, 0.500}, {0.667, 0.333, 0.500}, {0.667, 0.667, 0.500}, {0.667, 1.000, 0.500}, {1.000, 0.000, 0.500}, {1.000, 0.333, 0.500}, {1.000, 0.667, 0.500}, {1.000, 1.000, 0.500}, {0.000, 0.333, 1.000}, {0.000, 0.667, 1.000}, {0.000, 1.000, 1.000}, {0.333, 0.000, 1.000}, {0.333, 0.333, 1.000}, {0.333, 0.667, 1.000}, {0.333, 1.000, 1.000}, {0.667, 0.000, 1.000}, {0.667, 0.333, 1.000}, {0.667, 0.667, 1.000}, {0.667, 1.000, 1.000}, {1.000, 0.000, 1.000}, {1.000, 0.333, 1.000}, {1.000, 0.667, 1.000}, {0.333, 0.000, 0.000}, {0.500, 0.000, 0.000}, {0.667, 0.000, 0.000}, {0.833, 0.000, 0.000}, {1.000, 0.000, 0.000}, {0.000, 0.167, 0.000}, {0.000, 0.333, 0.000}, {0.000, 0.500, 0.000}, {0.000, 0.667, 0.000}, {0.000, 0.833, 0.000}, {0.000, 1.000, 0.000}, {0.000, 0.000, 0.167}, {0.000, 0.000, 0.333}, {0.000, 0.000, 0.500}, {0.000, 
0.000, 0.667}, {0.000, 0.000, 0.833}, {0.000, 0.000, 1.000}, {0.000, 0.000, 0.000}, {0.143, 0.143, 0.143}, {0.286, 0.286, 0.286}, {0.429, 0.429, 0.429}, {0.571, 0.571, 0.571}, {0.714, 0.714, 0.714}, {0.857, 0.857, 0.857}, {0.000, 0.447, 0.741}, {0.314, 0.717, 0.741}, {0.50, 0.5, 0} }; static void draw_objects(const cv::Mat& bgr, const std::vector& objects) { static const char* class_names[] = { "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush" }; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); cv::Scalar color = cv::Scalar(color_list[obj.label][0], color_list[obj.label][1], color_list[obj.label][2]); float c_mean = cv::mean(color)[0]; cv::Scalar txt_color; if (c_mean > 0.5){ txt_color = cv::Scalar(0, 0, 0); }else{ txt_color = cv::Scalar(255, 255, 255); } cv::rectangle(image, obj.rect, color * 255, 2); char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.4, 1, 
&baseLine); cv::Scalar txt_bk_color = color * 0.7 * 255; int x = obj.rect.x; int y = obj.rect.y + 1; //int y = obj.rect.y - label_size.height - baseLine; if (y > image.rows) y = image.rows; //if (x + label_size.width > image.cols) //x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), txt_bk_color, -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.4, txt_color, 1); } cv::imwrite("_demo.jpg" , image); fprintf(stderr, "save vis file\n"); /* cv::imshow("image", image); */ /* cv::waitKey(0); */ } int main(int argc, char* argv[]) { try { // ------------------------------ Parsing and validation of input arguments // --------------------------------- if (argc != 4) { tcout << "Usage : " << argv[0] << " " << std::endl; return EXIT_FAILURE; } const file_name_t input_model {argv[1]}; const file_name_t input_image_path {argv[2]}; const std::string device_name {argv[3]}; // ----------------------------------------------------------------------------------------------------- // --------------------------- Step 1. Initialize inference engine core // ------------------------------------- Core ie; // ----------------------------------------------------------------------------------------------------- // Step 2. Read a model in OpenVINO Intermediate Representation (.xml and // .bin files) or ONNX (.onnx file) format CNNNetwork network = ie.ReadNetwork(input_model); if (network.getOutputsInfo().size() != 1) throw std::logic_error("Sample supports topologies with 1 output only"); if (network.getInputsInfo().size() != 1) throw std::logic_error("Sample supports topologies with 1 input only"); // ----------------------------------------------------------------------------------------------------- // --------------------------- Step 3. 
Configure input & output // --------------------------------------------- // --------------------------- Prepare input blobs // ----------------------------------------------------- InputInfo::Ptr input_info = network.getInputsInfo().begin()->second; std::string input_name = network.getInputsInfo().begin()->first; /* Mark input as resizable by setting of a resize algorithm. * In this case we will be able to set an input blob of any shape to an * infer request. Resize and layout conversions are executed automatically * during inference */ //input_info->getPreProcess().setResizeAlgorithm(RESIZE_BILINEAR); //input_info->setLayout(Layout::NHWC); //input_info->setPrecision(Precision::FP32); // --------------------------- Prepare output blobs // ---------------------------------------------------- if (network.getOutputsInfo().empty()) { std::cerr << "Network outputs info is empty" << std::endl; return EXIT_FAILURE; } DataPtr output_info = network.getOutputsInfo().begin()->second; std::string output_name = network.getOutputsInfo().begin()->first; output_info->setPrecision(Precision::FP32); // ----------------------------------------------------------------------------------------------------- // --------------------------- Step 4. Loading a model to the device // ------------------------------------------ ExecutableNetwork executable_network = ie.LoadNetwork(network, device_name); // ----------------------------------------------------------------------------------------------------- // --------------------------- Step 5. Create an infer request // ------------------------------------------------- InferRequest infer_request = executable_network.CreateInferRequest(); // ----------------------------------------------------------------------------------------------------- // --------------------------- Step 6. 
Prepare input // -------------------------------------------------------- /* Read input image to a blob and set it to an infer request without resize * and layout conversions. */ cv::Mat image = imread_t(input_image_path); cv::Mat pr_img = static_resize(image); Blob::Ptr imgBlob = infer_request.GetBlob(input_name); // just wrap Mat data by Blob::Ptr blobFromImage(pr_img, imgBlob); // infer_request.SetBlob(input_name, imgBlob); // infer_request accepts input blob of any size // ----------------------------------------------------------------------------------------------------- // --------------------------- Step 7. Do inference // -------------------------------------------------------- /* Running the request synchronously */ infer_request.Infer(); // ----------------------------------------------------------------------------------------------------- // --------------------------- Step 8. Process output // ------------------------------------------------------ const Blob::Ptr output_blob = infer_request.GetBlob(output_name); MemoryBlob::CPtr moutput = as(output_blob); if (!moutput) { throw std::logic_error("We expect output to be inherited from MemoryBlob, " "but by fact we were not able to cast output to MemoryBlob"); } // locked memory holder should be alive all time while access to its buffer // happens auto moutputHolder = moutput->rmap(); const float* net_pred = moutputHolder.as::value_type*>(); const int image_size = 416; int img_w = image.cols; int img_h = image.rows; float scale = std::min(INPUT_W / (image.cols*1.0), INPUT_H / (image.rows*1.0)); std::vector objects; decode_outputs(net_pred, objects, scale, img_w, img_h); draw_objects(image, objects); // ----------------------------------------------------------------------------------------------------- } catch (const std::exception& ex) { std::cerr << ex.what() << std::endl; return EXIT_FAILURE; } return EXIT_SUCCESS; } ================================================ FILE: 
detector/YOLOX/demo/OpenVINO/python/README.md ================================================ # YOLOX-OpenVINO in Python This tutorial includes a Python demo for OpenVINO, as well as some converted models. ### Download OpenVINO models. | Model | Parameters | GFLOPs | Test Size | mAP | Weights | |:------| :----: | :----: | :---: | :---: | :---: | | [YOLOX-Nano](../../../exps/default/nano.py) | 0.91M | 1.08 | 416x416 | 25.3 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EeWY57o5wQZFtXYd1KJw6Z8B4vxZru649XxQHYIFgio3Qw?e=ZS81ce)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_nano_openvino.tar.gz) | | [YOLOX-Tiny](../../../exps/default/yolox_tiny.py) | 5.06M | 6.45 | 416x416 |31.7 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/ETfvOoCXdVZNinoSpKA_sEYBIQVqfjjF5_M6VvHRnLVcsA?e=STL1pi)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_tiny_openvino.tar.gz) | | [YOLOX-S](../../../exps/default/yolox_s.py) | 9.0M | 26.8 | 640x640 |39.6 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EXUjf3PQnbBLrxNrXPueqaIBzVZOrYQOnJpLK1Fytj5ssA?e=GK0LOM)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_s_openvino.tar.gz) | | [YOLOX-M](../../../exps/default/yolox_m.py) | 25.3M | 73.8 | 640x640 |46.4 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EcoT1BPpeRpLvE_4c441zn8BVNCQ2naxDH3rho7WqdlgLQ?e=95VaM9)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_m_openvino.tar.gz) | | [YOLOX-L](../../../exps/default/yolox_l.py) | 54.2M | 155.6 | 640x640 |50.0 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EZvmn-YLRuVPh0GAP_w3xHMB2VGvrKqQXyK_Cv5yi_DXUg?e=YRh6Eq)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_l_openvino.tar.gz) | | 
[YOLOX-Darknet53](../../../exps/default/yolov3.py) | 63.72M | 185.3 | 640x640 |47.3 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EQP8LSroikFHuwX0jFRetmcBOCDWSFmylHxolV7ezUPXGw?e=bEw5iq)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_darknet53_openvino.tar.gz) | | [YOLOX-X](../../../exps/default/yolox_x.py) | 99.1M | 281.9 | 640x640 |51.2 | [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/EZFPnLqiD-xIlt7rcZYDjQgB4YXE9wnq1qaSXQwJrsKbdg?e=83nwEz)/[github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_x_openvino.tar.gz) | ## Install OpenVINO Toolkit Please visit [Openvino Homepage](https://docs.openvinotoolkit.org/latest/get_started_guides.html) for more details. ## Set up the Environment ### For Linux **Option1. Set up the environment temporarily. You need to run this command every time you start a new shell window.** ```shell source /opt/intel/openvino_2021/bin/setupvars.sh ``` **Option2. Set up the environment permanently.** *Step1.* For Linux: ```shell vim ~/.bashrc ``` *Step2.* Add the following line into your file: ```shell source /opt/intel/openvino_2021/bin/setupvars.sh ``` *Step3.* Save and exit the file, then run: ```shell source ~/.bashrc ``` ## Convert model 1. Export ONNX model Please refer to the [ONNX tutorial](../../ONNXRuntime). **Note that you should set --opset to 10, otherwise your next step will fail.** 2. Convert ONNX to OpenVINO ``` shell cd <INSTALL_DIR>/openvino_2021/deployment_tools/model_optimizer ``` Install requirements for convert tool ```shell sudo ./install_prerequisites/install_prerequisites_onnx.sh ``` Then convert model.
```shell python3 mo.py --input_model --input_shape [--data_type FP16] ``` For example: ```shell python3 mo.py --input_model yolox.onnx --input_shape [1,3,640,640] --data_type FP16 --output_dir converted_output ``` ## Demo ### python ```shell python openvino_inference.py -m -i ``` or ```shell python openvino_inference.py -m -i -o -s -d ``` ================================================ FILE: detector/YOLOX/demo/OpenVINO/python/openvino_inference.py ================================================ #!/usr/bin/env python3 # -*- coding: utf-8 -*- # Copyright (C) 2018-2021 Intel Corporation # SPDX-License-Identifier: Apache-2.0 # Copyright (c) Megvii, Inc. and its affiliates. import argparse import logging as log import os import sys import cv2 import numpy as np from openvino.inference_engine import IECore from yolox.data.data_augment import preproc as preprocess from yolox.data.datasets import COCO_CLASSES from yolox.utils import mkdir, multiclass_nms, demo_postprocess, vis def parse_args() -> argparse.Namespace: """Parse and return command line arguments""" parser = argparse.ArgumentParser(add_help=False) args = parser.add_argument_group('Options') args.add_argument( '-h', '--help', action='help', help='Show this help message and exit.') args.add_argument( '-m', '--model', required=True, type=str, help='Required. Path to an .xml or .onnx file with a trained model.') args.add_argument( '-i', '--input', required=True, type=str, help='Required. Path to an image file.') args.add_argument( '-o', '--output_dir', type=str, default='demo_output', help='Path to your output dir.') args.add_argument( '-s', '--score_thr', type=float, default=0.3, help="Score threshould to visualize the result.") args.add_argument( '-d', '--device', default='CPU', type=str, help='Optional. Specify the target device to infer on; CPU, GPU, \ MYRIAD, HDDL or HETERO: is acceptable. The sample will look \ for a suitable plugin for device specified. 
Default value \ is CPU.') args.add_argument( '--labels', default=None, type=str, help='Option:al. Path to a labels mapping file.') args.add_argument( '-nt', '--number_top', default=10, type=int, help='Optional. Number of top results.') return parser.parse_args() def main(): log.basicConfig(format='[ %(levelname)s ] %(message)s', level=log.INFO, stream=sys.stdout) args = parse_args() # ---------------------------Step 1. Initialize inference engine core-------------------------------------------------- log.info('Creating Inference Engine') ie = IECore() # ---------------------------Step 2. Read a model in OpenVINO Intermediate Representation or ONNX format--------------- log.info(f'Reading the network: {args.model}') # (.xml and .bin files) or (.onnx file) net = ie.read_network(model=args.model) if len(net.input_info) != 1: log.error('Sample supports only single input topologies') return -1 if len(net.outputs) != 1: log.error('Sample supports only single output topologies') return -1 # ---------------------------Step 3. Configure input & output---------------------------------------------------------- log.info('Configuring input and output blobs') # Get names of input and output blobs input_blob = next(iter(net.input_info)) out_blob = next(iter(net.outputs)) # Set input and output precision manually net.input_info[input_blob].precision = 'FP32' net.outputs[out_blob].precision = 'FP16' # Get a number of classes recognized by a model num_of_classes = max(net.outputs[out_blob].shape) # ---------------------------Step 4. Loading model to the device------------------------------------------------------- log.info('Loading the model to the plugin') exec_net = ie.load_network(network=net, device_name=args.device) # ---------------------------Step 5. 
Create infer request-------------------------------------------------------------- # load_network() method of the IECore class with a specified number of requests (default 1) returns an ExecutableNetwork # instance which stores infer requests. So you already created Infer requests in the previous step. # ---------------------------Step 6. Prepare input--------------------------------------------------------------------- origin_img = cv2.imread(args.input) _, _, h, w = net.input_info[input_blob].input_data.shape mean = (0.485, 0.456, 0.406) std = (0.229, 0.224, 0.225) image, ratio = preprocess(origin_img, (h, w), mean, std) # ---------------------------Step 7. Do inference---------------------------------------------------------------------- log.info('Starting inference in synchronous mode') res = exec_net.infer(inputs={input_blob: image}) # ---------------------------Step 8. Process output-------------------------------------------------------------------- res = res[out_blob] predictions = demo_postprocess(res, (h, w), p6=False)[0] boxes = predictions[:, :4] scores = predictions[:, 4, None] * predictions[:, 5:] boxes_xyxy = np.ones_like(boxes) boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2]/2. boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3]/2. boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2]/2. boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3]/2. 
boxes_xyxy /= ratio dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.65, score_thr=0.1) if dets is not None: final_boxes = dets[:, :4] final_scores, final_cls_inds = dets[:, 4], dets[:, 5] origin_img = vis(origin_img, final_boxes, final_scores, final_cls_inds, conf=args.score_thr, class_names=COCO_CLASSES) mkdir(args.output_dir) output_path = os.path.join(args.output_dir, args.image_path.split("/")[-1]) cv2.imwrite(output_path, origin_img) if __name__ == '__main__': sys.exit(main()) ================================================ FILE: detector/YOLOX/demo/TensorRT/cpp/CMakeLists.txt ================================================ cmake_minimum_required(VERSION 2.6) project(yolox) add_definitions(-std=c++11) option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) set(CMAKE_CXX_STANDARD 11) set(CMAKE_BUILD_TYPE Debug) find_package(CUDA REQUIRED) include_directories(${PROJECT_SOURCE_DIR}/include) # include and link dirs of cuda and tensorrt, you need adapt them if yours are different # cuda include_directories(/data/cuda/cuda-10.2/cuda/include) link_directories(/data/cuda/cuda-10.2/cuda/lib64) # cudnn include_directories(/data/cuda/cuda-10.2/cudnn/v8.0.4/include) link_directories(/data/cuda/cuda-10.2/cudnn/v8.0.4/lib64) # tensorrt include_directories(/data/cuda/cuda-10.2/TensorRT/v7.2.1.6/include) link_directories(/data/cuda/cuda-10.2/TensorRT/v7.2.1.6/lib) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED") find_package(OpenCV) include_directories(${OpenCV_INCLUDE_DIRS}) add_executable(yolox ${PROJECT_SOURCE_DIR}/yolox.cpp) target_link_libraries(yolox nvinfer) target_link_libraries(yolox cudart) target_link_libraries(yolox ${OpenCV_LIBS}) add_definitions(-O2 -pthread) ================================================ FILE: detector/YOLOX/demo/TensorRT/cpp/README.md ================================================ # YOLOX-TensorRT in C++ As YOLOX models is easy to converted to tensorrt using [torch2trt 
gitrepo](https://github.com/NVIDIA-AI-IOT/torch2trt), our C++ demo will not include the model converting or constructing like other TensorRT demos. ## Step 1: Prepare serialized engine file Follow the trt [python demo README](../python/README.md) to convert and save the serialized engine file. Check the 'model_trt.engine' file generated from Step 1, which will be automatically saved in the current demo dir. ## Step 2: build the demo Please follow the [TensorRT Installation Guide](https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html) to install TensorRT. Install opencv with ```sudo apt-get install libopencv-dev```. Build the demo: ```shell mkdir build cd build cmake .. make ``` Then run the demo: ```shell ./yolox ../model_trt.engine -i ../../../../assets/dog.jpg ``` or ```shell ./yolox <path/to/engine_file> -i <path/to/image> ``` ================================================ FILE: detector/YOLOX/demo/TensorRT/cpp/logging.h ================================================ /* * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ #ifndef TENSORRT_LOGGING_H #define TENSORRT_LOGGING_H #include "NvInferRuntimeCommon.h" #include #include #include #include #include #include #include using Severity = nvinfer1::ILogger::Severity; class LogStreamConsumerBuffer : public std::stringbuf { public: LogStreamConsumerBuffer(std::ostream& stream, const std::string& prefix, bool shouldLog) : mOutput(stream) , mPrefix(prefix) , mShouldLog(shouldLog) { } LogStreamConsumerBuffer(LogStreamConsumerBuffer&& other) : mOutput(other.mOutput) { } ~LogStreamConsumerBuffer() { // std::streambuf::pbase() gives a pointer to the beginning of the buffered part of the output sequence // std::streambuf::pptr() gives a pointer to the current position of the output sequence // if the pointer to the beginning is not equal to the pointer to the current position, // call putOutput() to log the output to the stream if (pbase() != pptr()) { putOutput(); } } // synchronizes the stream buffer and returns 0 on success // synchronizing the stream buffer consists of inserting the buffer contents into the stream, // resetting the buffer and flushing the stream virtual int sync() { putOutput(); return 0; } void putOutput() { if (mShouldLog) { // prepend timestamp std::time_t timestamp = std::time(nullptr); tm* tm_local = std::localtime(×tamp); std::cout << "["; std::cout << std::setw(2) << std::setfill('0') << 1 + tm_local->tm_mon << "/"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_mday << "/"; std::cout << std::setw(4) << std::setfill('0') << 1900 + tm_local->tm_year << "-"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_hour << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_min << ":"; std::cout << std::setw(2) << std::setfill('0') << tm_local->tm_sec << "] "; // std::stringbuf::str() gets the string contents of the buffer // insert the buffer contents pre-appended by the appropriate prefix into the stream mOutput << mPrefix << str(); // set the buffer to empty str(""); // flush 
the stream mOutput.flush(); } } void setShouldLog(bool shouldLog) { mShouldLog = shouldLog; } private: std::ostream& mOutput; std::string mPrefix; bool mShouldLog; }; //! //! \class LogStreamConsumerBase //! \brief Convenience object used to initialize LogStreamConsumerBuffer before std::ostream in LogStreamConsumer //! class LogStreamConsumerBase { public: LogStreamConsumerBase(std::ostream& stream, const std::string& prefix, bool shouldLog) : mBuffer(stream, prefix, shouldLog) { } protected: LogStreamConsumerBuffer mBuffer; }; //! //! \class LogStreamConsumer //! \brief Convenience object used to facilitate use of C++ stream syntax when logging messages. //! Order of base classes is LogStreamConsumerBase and then std::ostream. //! This is because the LogStreamConsumerBase class is used to initialize the LogStreamConsumerBuffer member field //! in LogStreamConsumer and then the address of the buffer is passed to std::ostream. //! This is necessary to prevent the address of an uninitialized buffer from being passed to std::ostream. //! Please do not change the order of the parent classes. //! class LogStreamConsumer : protected LogStreamConsumerBase, public std::ostream { public: //! \brief Creates a LogStreamConsumer which logs messages with level severity. //! Reportable severity determines if the messages are severe enough to be logged. 
LogStreamConsumer(Severity reportableSeverity, Severity severity) : LogStreamConsumerBase(severityOstream(severity), severityPrefix(severity), severity <= reportableSeverity) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(severity <= reportableSeverity) , mSeverity(severity) { } LogStreamConsumer(LogStreamConsumer&& other) : LogStreamConsumerBase(severityOstream(other.mSeverity), severityPrefix(other.mSeverity), other.mShouldLog) , std::ostream(&mBuffer) // links the stream buffer with the stream , mShouldLog(other.mShouldLog) , mSeverity(other.mSeverity) { } void setReportableSeverity(Severity reportableSeverity) { mShouldLog = mSeverity <= reportableSeverity; mBuffer.setShouldLog(mShouldLog); } private: static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } static std::string severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } bool mShouldLog; Severity mSeverity; }; //! \class Logger //! //! \brief Class which manages logging of TensorRT tools and samples //! //! \details This class provides a common interface for TensorRT tools and samples to log information to the console, //! and supports logging two types of messages: //! //! - Debugging messages with an associated severity (info, warning, error, or internal error/fatal) //! - Test pass/fail messages //! //! The advantage of having all samples use this class for logging as opposed to emitting directly to stdout/stderr is //! that the logic for controlling the verbosity and formatting of sample output is centralized in one location. //! //! In the future, this class could be extended to support dumping test results to a file in some standard format //! 
(for example, JUnit XML), and providing additional metadata (e.g. timing the duration of a test run). //! //! TODO: For backwards compatibility with existing samples, this class inherits directly from the nvinfer1::ILogger //! interface, which is problematic since there isn't a clean separation between messages coming from the TensorRT //! library and messages coming from the sample. //! //! In the future (once all samples are updated to use Logger::getTRTLogger() to access the ILogger) we can refactor the //! class to eliminate the inheritance and instead make the nvinfer1::ILogger implementation a member of the Logger //! object. class Logger : public nvinfer1::ILogger { public: Logger(Severity severity = Severity::kWARNING) : mReportableSeverity(severity) { } //! //! \enum TestResult //! \brief Represents the state of a given test //! enum class TestResult { kRUNNING, //!< The test is running kPASSED, //!< The test passed kFAILED, //!< The test failed kWAIVED //!< The test was waived }; //! //! \brief Forward-compatible method for retrieving the nvinfer::ILogger associated with this Logger //! \return The nvinfer1::ILogger associated with this Logger //! //! TODO Once all samples are updated to use this method to register the logger with TensorRT, //! we can eliminate the inheritance of Logger from ILogger //! nvinfer1::ILogger& getTRTLogger() { return *this; } //! //! \brief Implementation of the nvinfer1::ILogger::log() virtual method //! //! Note samples should not be calling this function directly; it will eventually go away once we eliminate the //! inheritance from nvinfer1::ILogger //! void log(Severity severity, const char* msg) override { LogStreamConsumer(mReportableSeverity, severity) << "[TRT] " << std::string(msg) << std::endl; } //! //! \brief Method for controlling the verbosity of logging output //! //! \param severity The logger will only emit messages that have severity of this level or higher. //! 
void setReportableSeverity(Severity severity) { mReportableSeverity = severity; } //! //! \brief Opaque handle that holds logging information for a particular test //! //! This object is an opaque handle to information used by the Logger to print test results. //! The sample must call Logger::defineTest() in order to obtain a TestAtom that can be used //! with Logger::reportTest{Start,End}(). //! class TestAtom { public: TestAtom(TestAtom&&) = default; private: friend class Logger; TestAtom(bool started, const std::string& name, const std::string& cmdline) : mStarted(started) , mName(name) , mCmdline(cmdline) { } bool mStarted; std::string mName; std::string mCmdline; }; //! //! \brief Define a test for logging //! //! \param[in] name The name of the test. This should be a string starting with //! "TensorRT" and containing dot-separated strings containing //! the characters [A-Za-z0-9_]. //! For example, "TensorRT.sample_googlenet" //! \param[in] cmdline The command line used to reproduce the test // //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). //! static TestAtom defineTest(const std::string& name, const std::string& cmdline) { return TestAtom(false, name, cmdline); } //! //! \brief A convenience overloaded version of defineTest() that accepts an array of command-line arguments //! as input //! //! \param[in] name The name of the test //! \param[in] argc The number of command-line arguments //! \param[in] argv The array of command-line arguments (given as C strings) //! //! \return a TestAtom that can be used in Logger::reportTest{Start,End}(). static TestAtom defineTest(const std::string& name, int argc, char const* const* argv) { auto cmdline = genCmdlineString(argc, argv); return defineTest(name, cmdline); } //! //! \brief Report that a test has started. //! //! \pre reportTestStart() has not been called yet for the given testAtom //! //! \param[in] testAtom The handle to the test that has started //! 
static void reportTestStart(TestAtom& testAtom) { reportTestResult(testAtom, TestResult::kRUNNING); assert(!testAtom.mStarted); testAtom.mStarted = true; } //! //! \brief Report that a test has ended. //! //! \pre reportTestStart() has been called for the given testAtom //! //! \param[in] testAtom The handle to the test that has ended //! \param[in] result The result of the test. Should be one of TestResult::kPASSED, //! TestResult::kFAILED, TestResult::kWAIVED //! static void reportTestEnd(const TestAtom& testAtom, TestResult result) { assert(result != TestResult::kRUNNING); assert(testAtom.mStarted); reportTestResult(testAtom, result); } static int reportPass(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kPASSED); return EXIT_SUCCESS; } static int reportFail(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kFAILED); return EXIT_FAILURE; } static int reportWaive(const TestAtom& testAtom) { reportTestEnd(testAtom, TestResult::kWAIVED); return EXIT_SUCCESS; } static int reportTest(const TestAtom& testAtom, bool pass) { return pass ? reportPass(testAtom) : reportFail(testAtom); } Severity getReportableSeverity() const { return mReportableSeverity; } private: //! //! \brief returns an appropriate string for prefixing a log message with the given severity //! static const char* severityPrefix(Severity severity) { switch (severity) { case Severity::kINTERNAL_ERROR: return "[F] "; case Severity::kERROR: return "[E] "; case Severity::kWARNING: return "[W] "; case Severity::kINFO: return "[I] "; case Severity::kVERBOSE: return "[V] "; default: assert(0); return ""; } } //! //! \brief returns an appropriate string for prefixing a test result message with the given result //! 
static const char* testResultString(TestResult result) { switch (result) { case TestResult::kRUNNING: return "RUNNING"; case TestResult::kPASSED: return "PASSED"; case TestResult::kFAILED: return "FAILED"; case TestResult::kWAIVED: return "WAIVED"; default: assert(0); return ""; } } //! //! \brief returns an appropriate output stream (cout or cerr) to use with the given severity //! static std::ostream& severityOstream(Severity severity) { return severity >= Severity::kINFO ? std::cout : std::cerr; } //! //! \brief method that implements logging test results //! static void reportTestResult(const TestAtom& testAtom, TestResult result) { severityOstream(Severity::kINFO) << "&&&& " << testResultString(result) << " " << testAtom.mName << " # " << testAtom.mCmdline << std::endl; } //! //! \brief generate a command line string from the given (argc, argv) values //! static std::string genCmdlineString(int argc, char const* const* argv) { std::stringstream ss; for (int i = 0; i < argc; i++) { if (i > 0) ss << " "; ss << argv[i]; } return ss.str(); } Severity mReportableSeverity; }; namespace { //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kVERBOSE //! //! Example usage: //! //! LOG_VERBOSE(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_VERBOSE(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kVERBOSE); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINFO //! //! Example usage: //! //! LOG_INFO(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_INFO(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINFO); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kWARNING //! //! Example usage: //! //! LOG_WARN(logger) << "hello world" << std::endl; //! 
inline LogStreamConsumer LOG_WARN(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kWARNING); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kERROR //! //! Example usage: //! //! LOG_ERROR(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_ERROR(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kERROR); } //! //! \brief produces a LogStreamConsumer object that can be used to log messages of severity kINTERNAL_ERROR // ("fatal" severity) //! //! Example usage: //! //! LOG_FATAL(logger) << "hello world" << std::endl; //! inline LogStreamConsumer LOG_FATAL(const Logger& logger) { return LogStreamConsumer(logger.getReportableSeverity(), Severity::kINTERNAL_ERROR); } } // anonymous namespace #endif // TENSORRT_LOGGING_H ================================================ FILE: detector/YOLOX/demo/TensorRT/cpp/yolox.cpp ================================================ #include #include #include #include #include #include #include #include #include "NvInfer.h" #include "cuda_runtime_api.h" #include "logging.h" #define CHECK(status) \ do\ {\ auto ret = (status);\ if (ret != 0)\ {\ std::cerr << "Cuda failure: " << ret << std::endl;\ abort();\ }\ } while (0) #define DEVICE 0 // GPU id #define NMS_THRESH 0.65 #define BBOX_CONF_THRESH 0.3 using namespace nvinfer1; // stuff we know about the network and the input/output blobs static const int INPUT_W = 640; static const int INPUT_H = 640; const char* INPUT_BLOB_NAME = "input_0"; const char* OUTPUT_BLOB_NAME = "output_0"; static Logger gLogger; cv::Mat static_resize(cv::Mat& img) { float r = std::min(INPUT_W / (img.cols*1.0), INPUT_H / (img.rows*1.0)); // r = std::min(r, 1.0f); int unpad_w = r * img.cols; int unpad_h = r * img.rows; cv::Mat re(unpad_h, unpad_w, CV_8UC3); cv::resize(img, re, re.size()); cv::Mat out(INPUT_W, INPUT_H, CV_8UC3, cv::Scalar(114, 114, 114)); 
re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows))); return out; } struct Object { cv::Rect_ rect; int label; float prob; }; struct GridAndStride { int grid0; int grid1; int stride; }; static void generate_grids_and_stride(const int target_size, std::vector& strides, std::vector& grid_strides) { for (auto stride : strides) { int num_grid = target_size / stride; for (int g1 = 0; g1 < num_grid; g1++) { for (int g0 = 0; g0 < num_grid; g0++) { grid_strides.push_back((GridAndStride){g0, g1, stride}); } } } } static inline float intersection_area(const Object& a, const Object& b) { cv::Rect_ inter = a.rect & b.rect; return inter.area(); } static void qsort_descent_inplace(std::vector& faceobjects, int left, int right) { int i = left; int j = right; float p = faceobjects[(left + right) / 2].prob; while (i <= j) { while (faceobjects[i].prob > p) i++; while (faceobjects[j].prob < p) j--; if (i <= j) { // swap std::swap(faceobjects[i], faceobjects[j]); i++; j--; } } #pragma omp parallel sections { #pragma omp section { if (left < j) qsort_descent_inplace(faceobjects, left, j); } #pragma omp section { if (i < right) qsort_descent_inplace(faceobjects, i, right); } } } static void qsort_descent_inplace(std::vector& objects) { if (objects.empty()) return; qsort_descent_inplace(objects, 0, objects.size() - 1); } static void nms_sorted_bboxes(const std::vector& faceobjects, std::vector& picked, float nms_threshold) { picked.clear(); const int n = faceobjects.size(); std::vector areas(n); for (int i = 0; i < n; i++) { areas[i] = faceobjects[i].rect.area(); } for (int i = 0; i < n; i++) { const Object& a = faceobjects[i]; int keep = 1; for (int j = 0; j < (int)picked.size(); j++) { const Object& b = faceobjects[picked[j]]; // intersection over union float inter_area = intersection_area(a, b); float union_area = areas[i] + areas[picked[j]] - inter_area; // float IoU = inter_area / union_area if (inter_area / union_area > nms_threshold) keep = 0; } if (keep) picked.push_back(i); } } 
static void generate_yolox_proposals(std::vector grid_strides, float* feat_blob, float prob_threshold, std::vector& objects) { const int num_class = 80; const int num_anchors = grid_strides.size(); for (int anchor_idx = 0; anchor_idx < num_anchors; anchor_idx++) { const int grid0 = grid_strides[anchor_idx].grid0; const int grid1 = grid_strides[anchor_idx].grid1; const int stride = grid_strides[anchor_idx].stride; const int basic_pos = anchor_idx * 85; // yolox/models/yolo_head.py decode logic float x_center = (feat_blob[basic_pos+0] + grid0) * stride; float y_center = (feat_blob[basic_pos+1] + grid1) * stride; float w = exp(feat_blob[basic_pos+2]) * stride; float h = exp(feat_blob[basic_pos+3]) * stride; float x0 = x_center - w * 0.5f; float y0 = y_center - h * 0.5f; float box_objectness = feat_blob[basic_pos+4]; for (int class_idx = 0; class_idx < num_class; class_idx++) { float box_cls_score = feat_blob[basic_pos + 5 + class_idx]; float box_prob = box_objectness * box_cls_score; if (box_prob > prob_threshold) { Object obj; obj.rect.x = x0; obj.rect.y = y0; obj.rect.width = w; obj.rect.height = h; obj.label = class_idx; obj.prob = box_prob; objects.push_back(obj); } } // class loop } // point anchor loop } float* blobFromImage(cv::Mat& img){ cv::cvtColor(img, img, cv::COLOR_BGR2RGB); float* blob = new float[img.total()*3]; int channels = 3; int img_h = 640; int img_w = 640; std::vector mean = {0.485, 0.456, 0.406}; std::vector std = {0.229, 0.224, 0.225}; for (size_t c = 0; c < channels; c++) { for (size_t h = 0; h < img_h; h++) { for (size_t w = 0; w < img_w; w++) { blob[c * img_w * img_h + h * img_w + w] = (((float)img.at(h, w)[c]) / 255.0f - mean[c]) / std[c]; } } } return blob; } static void decode_outputs(float* prob, std::vector& objects, float scale, const int img_w, const int img_h) { std::vector proposals; std::vector strides = {8, 16, 32}; std::vector grid_strides; generate_grids_and_stride(INPUT_W, strides, grid_strides); 
generate_yolox_proposals(grid_strides, prob, BBOX_CONF_THRESH, proposals); std::cout << "num of boxes before nms: " << proposals.size() << std::endl; qsort_descent_inplace(proposals); std::vector picked; nms_sorted_bboxes(proposals, picked, NMS_THRESH); int count = picked.size(); std::cout << "num of boxes: " << count << std::endl; objects.resize(count); for (int i = 0; i < count; i++) { objects[i] = proposals[picked[i]]; // adjust offset to original unpadded float x0 = (objects[i].rect.x) / scale; float y0 = (objects[i].rect.y) / scale; float x1 = (objects[i].rect.x + objects[i].rect.width) / scale; float y1 = (objects[i].rect.y + objects[i].rect.height) / scale; // clip x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f); y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f); x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f); y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f); objects[i].rect.x = x0; objects[i].rect.y = y0; objects[i].rect.width = x1 - x0; objects[i].rect.height = y1 - y0; } } const float color_list[80][3] = { {0.000, 0.447, 0.741}, {0.850, 0.325, 0.098}, {0.929, 0.694, 0.125}, {0.494, 0.184, 0.556}, {0.466, 0.674, 0.188}, {0.301, 0.745, 0.933}, {0.635, 0.078, 0.184}, {0.300, 0.300, 0.300}, {0.600, 0.600, 0.600}, {1.000, 0.000, 0.000}, {1.000, 0.500, 0.000}, {0.749, 0.749, 0.000}, {0.000, 1.000, 0.000}, {0.000, 0.000, 1.000}, {0.667, 0.000, 1.000}, {0.333, 0.333, 0.000}, {0.333, 0.667, 0.000}, {0.333, 1.000, 0.000}, {0.667, 0.333, 0.000}, {0.667, 0.667, 0.000}, {0.667, 1.000, 0.000}, {1.000, 0.333, 0.000}, {1.000, 0.667, 0.000}, {1.000, 1.000, 0.000}, {0.000, 0.333, 0.500}, {0.000, 0.667, 0.500}, {0.000, 1.000, 0.500}, {0.333, 0.000, 0.500}, {0.333, 0.333, 0.500}, {0.333, 0.667, 0.500}, {0.333, 1.000, 0.500}, {0.667, 0.000, 0.500}, {0.667, 0.333, 0.500}, {0.667, 0.667, 0.500}, {0.667, 1.000, 0.500}, {1.000, 0.000, 0.500}, {1.000, 0.333, 0.500}, {1.000, 0.667, 0.500}, {1.000, 1.000, 0.500}, {0.000, 0.333, 1.000}, {0.000, 0.667, 1.000}, 
{0.000, 1.000, 1.000}, {0.333, 0.000, 1.000}, {0.333, 0.333, 1.000}, {0.333, 0.667, 1.000}, {0.333, 1.000, 1.000}, {0.667, 0.000, 1.000}, {0.667, 0.333, 1.000}, {0.667, 0.667, 1.000}, {0.667, 1.000, 1.000}, {1.000, 0.000, 1.000}, {1.000, 0.333, 1.000}, {1.000, 0.667, 1.000}, {0.333, 0.000, 0.000}, {0.500, 0.000, 0.000}, {0.667, 0.000, 0.000}, {0.833, 0.000, 0.000}, {1.000, 0.000, 0.000}, {0.000, 0.167, 0.000}, {0.000, 0.333, 0.000}, {0.000, 0.500, 0.000}, {0.000, 0.667, 0.000}, {0.000, 0.833, 0.000}, {0.000, 1.000, 0.000}, {0.000, 0.000, 0.167}, {0.000, 0.000, 0.333}, {0.000, 0.000, 0.500}, {0.000, 0.000, 0.667}, {0.000, 0.000, 0.833}, {0.000, 0.000, 1.000}, {0.000, 0.000, 0.000}, {0.143, 0.143, 0.143}, {0.286, 0.286, 0.286}, {0.429, 0.429, 0.429}, {0.571, 0.571, 0.571}, {0.714, 0.714, 0.714}, {0.857, 0.857, 0.857}, {0.000, 0.447, 0.741}, {0.314, 0.717, 0.741}, {0.50, 0.5, 0} }; static void draw_objects(const cv::Mat& bgr, const std::vector& objects, std::string f) { static const char* class_names[] = { "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush" }; cv::Mat image = bgr.clone(); for (size_t i = 0; i < objects.size(); i++) { const Object& obj = objects[i]; 
fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); cv::Scalar color = cv::Scalar(color_list[obj.label][0], color_list[obj.label][1], color_list[obj.label][2]); float c_mean = cv::mean(color)[0]; cv::Scalar txt_color; if (c_mean > 0.5){ txt_color = cv::Scalar(0, 0, 0); }else{ txt_color = cv::Scalar(255, 255, 255); } cv::rectangle(image, obj.rect, color * 255, 2); char text[256]; sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100); int baseLine = 0; cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.4, 1, &baseLine); cv::Scalar txt_bk_color = color * 0.7 * 255; int x = obj.rect.x; int y = obj.rect.y + 1; //int y = obj.rect.y - label_size.height - baseLine; if (y > image.rows) y = image.rows; //if (x + label_size.width > image.cols) //x = image.cols - label_size.width; cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)), txt_bk_color, -1); cv::putText(image, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.4, txt_color, 1); } cv::imwrite("det_res.jpg", image); fprintf(stderr, "save vis file\n"); /* cv::imshow("image", image); */ /* cv::waitKey(0); */ } void doInference(IExecutionContext& context, float* input, float* output, const int output_size, cv::Size input_shape) { const ICudaEngine& engine = context.getEngine(); // Pointers to input and output device buffers to pass to engine. // Engine requires exactly IEngine::getNbBindings() number of buffers. assert(engine.getNbBindings() == 2); void* buffers[2]; // In order to bind the buffers, we need to know the names of the input and output tensors. 
// Note that indices are guaranteed to be less than IEngine::getNbBindings() const int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME); assert(engine.getBindingDataType(inputIndex) == nvinfer1::DataType::kFLOAT); const int outputIndex = engine.getBindingIndex(OUTPUT_BLOB_NAME); assert(engine.getBindingDataType(outputIndex) == nvinfer1::DataType::kFLOAT); int mBatchSize = engine.getMaxBatchSize(); // Create GPU buffers on device CHECK(cudaMalloc(&buffers[inputIndex], 3 * input_shape.height * input_shape.width * sizeof(float))); CHECK(cudaMalloc(&buffers[outputIndex], output_size*sizeof(float))); // Create stream cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host CHECK(cudaMemcpyAsync(buffers[inputIndex], input, 3 * input_shape.height * input_shape.width * sizeof(float), cudaMemcpyHostToDevice, stream)); context.enqueue(1, buffers, stream, nullptr); CHECK(cudaMemcpyAsync(output, buffers[outputIndex], output_size * sizeof(float), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release stream and buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex])); } int main(int argc, char** argv) { cudaSetDevice(DEVICE); // create a model using the API directly and serialize it to a stream char *trtModelStream{nullptr}; size_t size{0}; if (argc == 4 && std::string(argv[2]) == "-i") { const std::string engine_file_path {argv[1]}; std::ifstream file(engine_file_path, std::ios::binary); if (file.good()) { file.seekg(0, file.end); size = file.tellg(); file.seekg(0, file.beg); trtModelStream = new char[size]; assert(trtModelStream); file.read(trtModelStream, size); file.close(); } } else { std::cerr << "arguments not right!" << std::endl; std::cerr << "run 'python3 yolox/deploy/trt.py -n yolox-{tiny, s, m, l, x}' to serialize model first!" 
<< std::endl; std::cerr << "Then use the following command:" << std::endl; std::cerr << "./yolox ../model_trt.engine -i ../../../assets/dog.jpg // deserialize file and run inference" << std::endl; return -1; } const std::string input_image_path {argv[3]}; //std::vector file_names; //if (read_files_in_dir(argv[2], file_names) < 0) { //std::cout << "read_files_in_dir failed." << std::endl; //return -1; //} IRuntime* runtime = createInferRuntime(gLogger); assert(runtime != nullptr); ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size); assert(engine != nullptr); IExecutionContext* context = engine->createExecutionContext(); assert(context != nullptr); delete[] trtModelStream; auto out_dims = engine->getBindingDimensions(1); auto output_size = 1; for(int j=0;j(end - start).count() << "ms" << std::endl; std::vector objects; decode_outputs(prob, objects, scale, img_w, img_h); draw_objects(img, objects, input_image_path); // destroy the engine context->destroy(); engine->destroy(); runtime->destroy(); return 0; } ================================================ FILE: detector/YOLOX/demo/TensorRT/python/README.md ================================================ # YOLOX-TensorRT in Python This tutorial includes a Python demo for TensorRT. ## Install TensorRT Toolkit Please follow the [TensorRT Installation Guide](https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html) and [torch2trt gitrepo](https://github.com/NVIDIA-AI-IOT/torch2trt) to install TensorRT and torch2trt. ## Convert model YOLOX models can be easily converted to TensorRT models using torch2trt. If you want to convert our model, use the flag -n to specify a model name: ```shell python tools/trt.py -n -c ``` For example: ```shell python tools/trt.py -n yolox-s -c your_ckpt.pth.tar ``` The model name can be: yolox-nano, yolox-tiny, yolox-s, yolox-m, yolox-l, yolox-x.
If you want to convert your customized model, use the flag -f to specify your exp file: ```shell python tools/trt.py -f -c ``` For example: ```shell python tools/trt.py -f /path/to/your/yolox/exps/yolox_s.py -c your_ckpt.pth.tar ``` *yolox_s.py* can be any exp file modified by you. The converted model and the serialized engine file (for C++ demo) will be saved in your experiment output dir. ## Demo The TensorRT python demo is merged into our PyTorch demo file, so you can run the PyTorch demo command with ```--trt```. ```shell python tools/demo.py image -n yolox-s --trt --save_result ``` or ```shell python tools/demo.py image -f exps/default/yolox_s.py --trt --save_result ``` ================================================ FILE: detector/YOLOX/demo/ncnn/android/README.md ================================================ # YOLOX-Android-ncnn Android app for YOLOX object detection based on [ncnn](https://github.com/Tencent/ncnn) ## Tutorial ### Step1 Download ncnn-android-vulkan.zip from [releases of ncnn](https://github.com/Tencent/ncnn/releases). This repo uses the [20210525 release](https://github.com/Tencent/ncnn/releases/download/20210525/ncnn-20210525-android-vulkan.zip) for building. ### Step2 After downloading, please extract your zip file. Then, there are two ways to finish this step: * put your extracted directory into app/src/main/jni * change the ncnn_DIR path in app/src/main/jni/CMakeLists.txt to your extracted directory. ### Step3 Download the example param and bin files from [onedrive](https://megvii-my.sharepoint.cn/:u:/g/personal/gezheng_megvii_com/ESXBH_GSSmFMszWJ6YG2VkQB5cWDfqVWXgk0D996jH0rpQ?e=qzEqUh) or [github](https://github.com/Megvii-BaseDetection/storage/releases/download/0.0.1/yolox_s_ncnn.tar.gz). Unzip the file to app/src/main/assets. ### Step4 Open this project with Android Studio, build it and enjoy!
## Reference * [ncnn-android-yolov5](https://github.com/nihui/ncnn-android-yolov5) ================================================ FILE: detector/YOLOX/demo/ncnn/android/app/build.gradle ================================================ apply plugin: 'com.android.application' android { compileSdkVersion 24 buildToolsVersion "29.0.2" defaultConfig { applicationId "com.megvii.yoloXncnn" archivesBaseName = "$applicationId" ndk { moduleName "ncnn" abiFilters "armeabi-v7a", "arm64-v8a" } minSdkVersion 24 } externalNativeBuild { cmake { version "3.10.2" path file('src/main/jni/CMakeLists.txt') } } } ================================================ FILE: detector/YOLOX/demo/ncnn/android/app/src/main/AndroidManifest.xml ================================================ ================================================ FILE: detector/YOLOX/demo/ncnn/android/app/src/main/assets/yolox.param ================================================ 7767517 220 250 Input images 0 1 images YoloV5Focus focus 1 1 images 503 Convolution Conv_41 1 1 503 877 0=32 1=3 4=1 5=1 6=3456 Swish Mul_43 1 1 877 507 Convolution Conv_44 1 1 507 880 0=64 1=3 3=2 4=1 5=1 6=18432 Swish Mul_46 1 1 880 511 Split splitncnn_0 1 2 511 511_splitncnn_0 511_splitncnn_1 Convolution Conv_47 1 1 511_splitncnn_1 883 0=32 1=1 5=1 6=2048 Swish Mul_49 1 1 883 515 Split splitncnn_1 1 2 515 515_splitncnn_0 515_splitncnn_1 Convolution Conv_50 1 1 511_splitncnn_0 886 0=32 1=1 5=1 6=2048 Swish Mul_52 1 1 886 519 Convolution Conv_53 1 1 515_splitncnn_1 889 0=32 1=1 5=1 6=1024 Swish Mul_55 1 1 889 523 Convolution Conv_56 1 1 523 892 0=32 1=3 4=1 5=1 6=9216 Swish Mul_58 1 1 892 527 BinaryOp Add_59 2 1 527 515_splitncnn_0 528 Concat Concat_60 2 1 528 519 529 Convolution Conv_61 1 1 529 895 0=64 1=1 5=1 6=4096 Swish Mul_63 1 1 895 533 Convolution Conv_64 1 1 533 898 0=128 1=3 3=2 4=1 5=1 6=73728 Swish Mul_66 1 1 898 537 Split splitncnn_2 1 2 537 537_splitncnn_0 537_splitncnn_1 Convolution Conv_67 1 1 537_splitncnn_1 901 0=64 1=1 5=1 
6=8192 Swish Mul_69 1 1 901 541 Split splitncnn_3 1 2 541 541_splitncnn_0 541_splitncnn_1 Convolution Conv_70 1 1 537_splitncnn_0 904 0=64 1=1 5=1 6=8192 Swish Mul_72 1 1 904 545 Convolution Conv_73 1 1 541_splitncnn_1 907 0=64 1=1 5=1 6=4096 Swish Mul_75 1 1 907 549 Convolution Conv_76 1 1 549 910 0=64 1=3 4=1 5=1 6=36864 Swish Mul_78 1 1 910 553 BinaryOp Add_79 2 1 553 541_splitncnn_0 554 Split splitncnn_4 1 2 554 554_splitncnn_0 554_splitncnn_1 Convolution Conv_80 1 1 554_splitncnn_1 913 0=64 1=1 5=1 6=4096 Swish Mul_82 1 1 913 558 Convolution Conv_83 1 1 558 916 0=64 1=3 4=1 5=1 6=36864 Swish Mul_85 1 1 916 562 BinaryOp Add_86 2 1 562 554_splitncnn_0 563 Split splitncnn_5 1 2 563 563_splitncnn_0 563_splitncnn_1 Convolution Conv_87 1 1 563_splitncnn_1 919 0=64 1=1 5=1 6=4096 Swish Mul_89 1 1 919 567 Convolution Conv_90 1 1 567 922 0=64 1=3 4=1 5=1 6=36864 Swish Mul_92 1 1 922 571 BinaryOp Add_93 2 1 571 563_splitncnn_0 572 Concat Concat_94 2 1 572 545 573 Convolution Conv_95 1 1 573 925 0=128 1=1 5=1 6=16384 Swish Mul_97 1 1 925 577 Split splitncnn_6 1 2 577 577_splitncnn_0 577_splitncnn_1 Convolution Conv_98 1 1 577_splitncnn_1 928 0=256 1=3 3=2 4=1 5=1 6=294912 Swish Mul_100 1 1 928 581 Split splitncnn_7 1 2 581 581_splitncnn_0 581_splitncnn_1 Convolution Conv_101 1 1 581_splitncnn_1 931 0=128 1=1 5=1 6=32768 Swish Mul_103 1 1 931 585 Split splitncnn_8 1 2 585 585_splitncnn_0 585_splitncnn_1 Convolution Conv_104 1 1 581_splitncnn_0 934 0=128 1=1 5=1 6=32768 Swish Mul_106 1 1 934 589 Convolution Conv_107 1 1 585_splitncnn_1 937 0=128 1=1 5=1 6=16384 Swish Mul_109 1 1 937 593 Convolution Conv_110 1 1 593 940 0=128 1=3 4=1 5=1 6=147456 Swish Mul_112 1 1 940 597 BinaryOp Add_113 2 1 597 585_splitncnn_0 598 Split splitncnn_9 1 2 598 598_splitncnn_0 598_splitncnn_1 Convolution Conv_114 1 1 598_splitncnn_1 943 0=128 1=1 5=1 6=16384 Swish Mul_116 1 1 943 602 Convolution Conv_117 1 1 602 946 0=128 1=3 4=1 5=1 6=147456 Swish Mul_119 1 1 946 606 BinaryOp Add_120 2 1 606 
598_splitncnn_0 607 Split splitncnn_10 1 2 607 607_splitncnn_0 607_splitncnn_1 Convolution Conv_121 1 1 607_splitncnn_1 949 0=128 1=1 5=1 6=16384 Swish Mul_123 1 1 949 611 Convolution Conv_124 1 1 611 952 0=128 1=3 4=1 5=1 6=147456 Swish Mul_126 1 1 952 615 BinaryOp Add_127 2 1 615 607_splitncnn_0 616 Concat Concat_128 2 1 616 589 617 Convolution Conv_129 1 1 617 955 0=256 1=1 5=1 6=65536 Swish Mul_131 1 1 955 621 Split splitncnn_11 1 2 621 621_splitncnn_0 621_splitncnn_1 Convolution Conv_132 1 1 621_splitncnn_1 958 0=512 1=3 3=2 4=1 5=1 6=1179648 Swish Mul_134 1 1 958 625 Convolution Conv_135 1 1 625 961 0=256 1=1 5=1 6=131072 Swish Mul_137 1 1 961 629 Split splitncnn_12 1 4 629 629_splitncnn_0 629_splitncnn_1 629_splitncnn_2 629_splitncnn_3 Pooling MaxPool_138 1 1 629_splitncnn_3 630 1=5 3=2 5=1 Pooling MaxPool_139 1 1 629_splitncnn_2 631 1=9 3=4 5=1 Pooling MaxPool_140 1 1 629_splitncnn_1 632 1=13 3=6 5=1 Concat Concat_141 4 1 629_splitncnn_0 630 631 632 633 Convolution Conv_142 1 1 633 964 0=512 1=1 5=1 6=524288 Swish Mul_144 1 1 964 637 Split splitncnn_13 1 2 637 637_splitncnn_0 637_splitncnn_1 Convolution Conv_145 1 1 637_splitncnn_1 967 0=256 1=1 5=1 6=131072 Swish Mul_147 1 1 967 641 Convolution Conv_148 1 1 637_splitncnn_0 970 0=256 1=1 5=1 6=131072 Swish Mul_150 1 1 970 645 Convolution Conv_151 1 1 641 973 0=256 1=1 5=1 6=65536 Swish Mul_153 1 1 973 649 Convolution Conv_154 1 1 649 976 0=256 1=3 4=1 5=1 6=589824 Swish Mul_156 1 1 976 653 Concat Concat_157 2 1 653 645 654 Convolution Conv_158 1 1 654 979 0=512 1=1 5=1 6=262144 Swish Mul_160 1 1 979 658 Convolution Conv_161 1 1 658 982 0=256 1=1 5=1 6=131072 Swish Mul_163 1 1 982 662 Split splitncnn_14 1 2 662 662_splitncnn_0 662_splitncnn_1 Interp Resize_165 1 1 662_splitncnn_1 667 0=1 1=2.000000e+00 2=2.000000e+00 Concat Concat_166 2 1 667 621_splitncnn_0 668 Split splitncnn_15 1 2 668 668_splitncnn_0 668_splitncnn_1 Convolution Conv_167 1 1 668_splitncnn_1 985 0=128 1=1 5=1 6=65536 Swish Mul_169 1 1 985 
672 Convolution Conv_170 1 1 668_splitncnn_0 988 0=128 1=1 5=1 6=65536 Swish Mul_172 1 1 988 676 Convolution Conv_173 1 1 672 991 0=128 1=1 5=1 6=16384 Swish Mul_175 1 1 991 680 Convolution Conv_176 1 1 680 994 0=128 1=3 4=1 5=1 6=147456 Swish Mul_178 1 1 994 684 Concat Concat_179 2 1 684 676 685 Convolution Conv_180 1 1 685 997 0=256 1=1 5=1 6=65536 Swish Mul_182 1 1 997 689 Convolution Conv_183 1 1 689 1000 0=128 1=1 5=1 6=32768 Swish Mul_185 1 1 1000 693 Split splitncnn_16 1 2 693 693_splitncnn_0 693_splitncnn_1 Interp Resize_187 1 1 693_splitncnn_1 698 0=1 1=2.000000e+00 2=2.000000e+00 Concat Concat_188 2 1 698 577_splitncnn_0 699 Split splitncnn_17 1 2 699 699_splitncnn_0 699_splitncnn_1 Convolution Conv_189 1 1 699_splitncnn_1 1003 0=64 1=1 5=1 6=16384 Swish Mul_191 1 1 1003 703 Convolution Conv_192 1 1 699_splitncnn_0 1006 0=64 1=1 5=1 6=16384 Swish Mul_194 1 1 1006 707 Convolution Conv_195 1 1 703 1009 0=64 1=1 5=1 6=4096 Swish Mul_197 1 1 1009 711 Convolution Conv_198 1 1 711 1012 0=64 1=3 4=1 5=1 6=36864 Swish Mul_200 1 1 1012 715 Concat Concat_201 2 1 715 707 716 Convolution Conv_202 1 1 716 1015 0=128 1=1 5=1 6=16384 Swish Mul_204 1 1 1015 720 Split splitncnn_18 1 2 720 720_splitncnn_0 720_splitncnn_1 Convolution Conv_205 1 1 720_splitncnn_1 1018 0=128 1=3 3=2 4=1 5=1 6=147456 Swish Mul_207 1 1 1018 724 Concat Concat_208 2 1 724 693_splitncnn_0 725 Split splitncnn_19 1 2 725 725_splitncnn_0 725_splitncnn_1 Convolution Conv_209 1 1 725_splitncnn_1 1021 0=128 1=1 5=1 6=32768 Swish Mul_211 1 1 1021 729 Convolution Conv_212 1 1 725_splitncnn_0 1024 0=128 1=1 5=1 6=32768 Swish Mul_214 1 1 1024 733 Convolution Conv_215 1 1 729 1027 0=128 1=1 5=1 6=16384 Swish Mul_217 1 1 1027 737 Convolution Conv_218 1 1 737 1030 0=128 1=3 4=1 5=1 6=147456 Swish Mul_220 1 1 1030 741 Concat Concat_221 2 1 741 733 742 Convolution Conv_222 1 1 742 1033 0=256 1=1 5=1 6=65536 Swish Mul_224 1 1 1033 746 Split splitncnn_20 1 2 746 746_splitncnn_0 746_splitncnn_1 Convolution Conv_225 
1 1 746_splitncnn_1 1036 0=256 1=3 3=2 4=1 5=1 6=589824 Swish Mul_227 1 1 1036 750 Concat Concat_228 2 1 750 662_splitncnn_0 751 Split splitncnn_21 1 2 751 751_splitncnn_0 751_splitncnn_1 Convolution Conv_229 1 1 751_splitncnn_1 1039 0=256 1=1 5=1 6=131072 Swish Mul_231 1 1 1039 755 Convolution Conv_232 1 1 751_splitncnn_0 1042 0=256 1=1 5=1 6=131072 Swish Mul_234 1 1 1042 759 Convolution Conv_235 1 1 755 1045 0=256 1=1 5=1 6=65536 Swish Mul_237 1 1 1045 763 Convolution Conv_238 1 1 763 1048 0=256 1=3 4=1 5=1 6=589824 Swish Mul_240 1 1 1048 767 Concat Concat_241 2 1 767 759 768 Convolution Conv_242 1 1 768 1051 0=512 1=1 5=1 6=262144 Swish Mul_244 1 1 1051 772 Convolution Conv_245 1 1 720_splitncnn_0 1054 0=128 1=1 5=1 6=16384 Swish Mul_247 1 1 1054 776 Split splitncnn_22 1 2 776 776_splitncnn_0 776_splitncnn_1 Convolution Conv_248 1 1 776_splitncnn_1 1057 0=128 1=3 4=1 5=1 6=147456 Swish Mul_250 1 1 1057 780 Convolution Conv_251 1 1 780 1060 0=128 1=3 4=1 5=1 6=147456 Swish Mul_253 1 1 1060 784 Convolution Conv_254 1 1 784 797 0=80 1=1 5=1 6=10240 9=4 Convolution Conv_255 1 1 776_splitncnn_0 1063 0=128 1=3 4=1 5=1 6=147456 Swish Mul_257 1 1 1063 789 Convolution Conv_258 1 1 789 1066 0=128 1=3 4=1 5=1 6=147456 Swish Mul_260 1 1 1066 793 Split splitncnn_23 1 2 793 793_splitncnn_0 793_splitncnn_1 Convolution Conv_261 1 1 793_splitncnn_1 794 0=4 1=1 5=1 6=512 Convolution Conv_262 1 1 793_splitncnn_0 796 0=1 1=1 5=1 6=128 9=4 Concat Concat_265 3 1 794 796 797 798 Convolution Conv_266 1 1 746_splitncnn_0 1069 0=128 1=1 5=1 6=32768 Swish Mul_268 1 1 1069 802 Split splitncnn_24 1 2 802 802_splitncnn_0 802_splitncnn_1 Convolution Conv_269 1 1 802_splitncnn_1 1072 0=128 1=3 4=1 5=1 6=147456 Swish Mul_271 1 1 1072 806 Convolution Conv_272 1 1 806 1075 0=128 1=3 4=1 5=1 6=147456 Swish Mul_274 1 1 1075 810 Convolution Conv_275 1 1 810 823 0=80 1=1 5=1 6=10240 9=4 Convolution Conv_276 1 1 802_splitncnn_0 1078 0=128 1=3 4=1 5=1 6=147456 Swish Mul_278 1 1 1078 815 Convolution 
Conv_279 1 1 815 1081 0=128 1=3 4=1 5=1 6=147456 Swish Mul_281 1 1 1081 819 Split splitncnn_25 1 2 819 819_splitncnn_0 819_splitncnn_1 Convolution Conv_282 1 1 819_splitncnn_1 820 0=4 1=1 5=1 6=512 Convolution Conv_283 1 1 819_splitncnn_0 822 0=1 1=1 5=1 6=128 9=4 Concat Concat_286 3 1 820 822 823 824 Convolution Conv_287 1 1 772 1084 0=128 1=1 5=1 6=65536 Swish Mul_289 1 1 1084 828 Split splitncnn_26 1 2 828 828_splitncnn_0 828_splitncnn_1 Convolution Conv_290 1 1 828_splitncnn_1 1087 0=128 1=3 4=1 5=1 6=147456 Swish Mul_292 1 1 1087 832 Convolution Conv_293 1 1 832 1090 0=128 1=3 4=1 5=1 6=147456 Swish Mul_295 1 1 1090 836 Convolution Conv_296 1 1 836 849 0=80 1=1 5=1 6=10240 9=4 Convolution Conv_297 1 1 828_splitncnn_0 1093 0=128 1=3 4=1 5=1 6=147456 Swish Mul_299 1 1 1093 841 Convolution Conv_300 1 1 841 1096 0=128 1=3 4=1 5=1 6=147456 Swish Mul_302 1 1 1096 845 Split splitncnn_27 1 2 845 845_splitncnn_0 845_splitncnn_1 Convolution Conv_303 1 1 845_splitncnn_1 846 0=4 1=1 5=1 6=512 Convolution Conv_304 1 1 845_splitncnn_0 848 0=1 1=1 5=1 6=128 9=4 Concat Concat_307 3 1 846 848 849 850 Reshape Reshape_315 1 1 798 858 0=-1 1=85 Reshape Reshape_323 1 1 824 866 0=-1 1=85 Reshape Reshape_331 1 1 850 874 0=-1 1=85 Concat Concat_332 3 1 858 866 874 875 0=1 Permute Transpose_333 1 1 875 output 0=1 ================================================ FILE: detector/YOLOX/demo/ncnn/android/app/src/main/java/com/megvii/yoloXncnn/MainActivity.java ================================================ // Some code in this file is based on: // https://github.com/nihui/ncnn-android-yolov5/blob/master/app/src/main/java/com/tencent/yolov5ncnn/MainActivity.java // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. // Copyright (C) Megvii, Inc. and its affiliates. All rights reserved. 
package com.megvii.yoloXncnn; import android.app.Activity; import android.content.Intent; import android.graphics.Bitmap; import android.graphics.BitmapFactory; import android.graphics.Canvas; import android.graphics.Color; import android.graphics.Paint; import android.media.ExifInterface; import android.graphics.Matrix; import android.net.Uri; import android.os.Bundle; import android.util.Log; import android.view.View; import android.widget.Button; import android.widget.ImageView; import java.io.FileNotFoundException; import java.io.InputStream; import java.io.IOException; public class MainActivity extends Activity { private static final int SELECT_IMAGE = 1; private ImageView imageView; private Bitmap bitmap = null; private Bitmap yourSelectedImage = null; private YOLOXncnn yoloX = new YOLOXncnn(); /** Called when the activity is first created. */ @Override public void onCreate(Bundle savedInstanceState) { super.onCreate(savedInstanceState); setContentView(R.layout.main); boolean ret_init = yoloX.Init(getAssets()); if (!ret_init) { Log.e("MainActivity", "yoloXncnn Init failed"); } imageView = (ImageView) findViewById(R.id.imageView); Button buttonImage = (Button) findViewById(R.id.buttonImage); buttonImage.setOnClickListener(new View.OnClickListener() { @Override public void onClick(View arg0) { Intent i = new Intent(Intent.ACTION_PICK); i.setType("image/*"); startActivityForResult(i, SELECT_IMAGE); } }); Button buttonDetect = (Button) findViewById(R.id.buttonDetect); buttonDetect.setOnClickListener(new View.OnClickListener() { @Override public void onClick(View arg0) { if (yourSelectedImage == null) return; YOLOXncnn.Obj[] objects = yoloX.Detect(yourSelectedImage, false); showObjects(objects); } }); Button buttonDetectGPU = (Button) findViewById(R.id.buttonDetectGPU); buttonDetectGPU.setOnClickListener(new View.OnClickListener() { @Override public void onClick(View arg0) { if (yourSelectedImage == null) return; YOLOXncnn.Obj[] objects = 
yoloX.Detect(yourSelectedImage, true); showObjects(objects); } }); } private void showObjects(YOLOXncnn.Obj[] objects) { if (objects == null) { imageView.setImageBitmap(bitmap); return; } // draw objects on bitmap Bitmap rgba = bitmap.copy(Bitmap.Config.ARGB_8888, true); final int[] colors = new int[] { Color.rgb( 54, 67, 244), Color.rgb( 99, 30, 233), Color.rgb(176, 39, 156), Color.rgb(183, 58, 103), Color.rgb(181, 81, 63), Color.rgb(243, 150, 33), Color.rgb(244, 169, 3), Color.rgb(212, 188, 0), Color.rgb(136, 150, 0), Color.rgb( 80, 175, 76), Color.rgb( 74, 195, 139), Color.rgb( 57, 220, 205), Color.rgb( 59, 235, 255), Color.rgb( 7, 193, 255), Color.rgb( 0, 152, 255), Color.rgb( 34, 87, 255), Color.rgb( 72, 85, 121), Color.rgb(158, 158, 158), Color.rgb(139, 125, 96) }; Canvas canvas = new Canvas(rgba); Paint paint = new Paint(); paint.setStyle(Paint.Style.STROKE); paint.setStrokeWidth(4); Paint textbgpaint = new Paint(); textbgpaint.setColor(Color.WHITE); textbgpaint.setStyle(Paint.Style.FILL); Paint textpaint = new Paint(); textpaint.setColor(Color.BLACK); textpaint.setTextSize(26); textpaint.setTextAlign(Paint.Align.LEFT); for (int i = 0; i < objects.length; i++) { paint.setColor(colors[i % 19]); canvas.drawRect(objects[i].x, objects[i].y, objects[i].x + objects[i].w, objects[i].y + objects[i].h, paint); // draw filled text inside image { String text = objects[i].label + " = " + String.format("%.1f", objects[i].prob * 100) + "%"; float text_width = textpaint.measureText(text); float text_height = - textpaint.ascent() + textpaint.descent(); float x = objects[i].x; float y = objects[i].y - text_height; if (y < 0) y = 0; if (x + text_width > rgba.getWidth()) x = rgba.getWidth() - text_width; canvas.drawRect(x, y, x + text_width, y + text_height, textbgpaint); canvas.drawText(text, x, y - textpaint.ascent(), textpaint); } } imageView.setImageBitmap(rgba); } @Override protected void onActivityResult(int requestCode, int resultCode, Intent data) { 
super.onActivityResult(requestCode, resultCode, data); if (resultCode == RESULT_OK && null != data) { Uri selectedImage = data.getData(); try { if (requestCode == SELECT_IMAGE) { bitmap = decodeUri(selectedImage); yourSelectedImage = bitmap.copy(Bitmap.Config.ARGB_8888, true); imageView.setImageBitmap(bitmap); } } catch (FileNotFoundException e) { Log.e("MainActivity", "FileNotFoundException"); return; } } } private Bitmap decodeUri(Uri selectedImage) throws FileNotFoundException { // Decode image size BitmapFactory.Options o = new BitmapFactory.Options(); o.inJustDecodeBounds = true; BitmapFactory.decodeStream(getContentResolver().openInputStream(selectedImage), null, o); // The new size we want to scale to final int REQUIRED_SIZE = 640; // Find the correct scale value. It should be the power of 2. int width_tmp = o.outWidth, height_tmp = o.outHeight; int scale = 1; while (true) { if (width_tmp / 2 < REQUIRED_SIZE || height_tmp / 2 < REQUIRED_SIZE) { break; } width_tmp /= 2; height_tmp /= 2; scale *= 2; } // Decode with inSampleSize BitmapFactory.Options o2 = new BitmapFactory.Options(); o2.inSampleSize = scale; Bitmap bitmap = BitmapFactory.decodeStream(getContentResolver().openInputStream(selectedImage), null, o2); // Rotate according to EXIF int rotate = 0; try { ExifInterface exif = new ExifInterface(getContentResolver().openInputStream(selectedImage)); int orientation = exif.getAttributeInt(ExifInterface.TAG_ORIENTATION, ExifInterface.ORIENTATION_NORMAL); switch (orientation) { case ExifInterface.ORIENTATION_ROTATE_270: rotate = 270; break; case ExifInterface.ORIENTATION_ROTATE_180: rotate = 180; break; case ExifInterface.ORIENTATION_ROTATE_90: rotate = 90; break; } } catch (IOException e) { Log.e("MainActivity", "ExifInterface IOException"); } Matrix matrix = new Matrix(); matrix.postRotate(rotate); return Bitmap.createBitmap(bitmap, 0, 0, bitmap.getWidth(), bitmap.getHeight(), matrix, true); } } ================================================ FILE: 
detector/YOLOX/demo/ncnn/android/app/src/main/java/com/megvii/yoloXncnn/YOLOXncnn.java ================================================ // Copyright (C) Megvii, Inc. and its affiliates. All rights reserved. package com.megvii.yoloXncnn; import android.content.res.AssetManager; import android.graphics.Bitmap; public class YOLOXncnn { public native boolean Init(AssetManager mgr); public class Obj { public float x; public float y; public float w; public float h; public String label; public float prob; } public native Obj[] Detect(Bitmap bitmap, boolean use_gpu); static { System.loadLibrary("yoloXncnn"); } } ================================================ FILE: detector/YOLOX/demo/ncnn/android/app/src/main/java/com/megvii/yoloXncnn/yoloXncnn.java ================================================ // Copyright (C) Megvii, Inc. and its affiliates. All rights reserved. package com.megvii.yoloXncnn; import android.content.res.AssetManager; import android.graphics.Bitmap; public class YOLOXncnn { public native boolean Init(AssetManager mgr); public class Obj { public float x; public float y; public float w; public float h; public String label; public float prob; } public native Obj[] Detect(Bitmap bitmap, boolean use_gpu); static { System.loadLibrary("yoloXncnn"); } } ================================================ FILE: detector/YOLOX/demo/ncnn/android/app/src/main/jni/CMakeLists.txt ================================================ project(yoloXncnn) cmake_minimum_required(VERSION 3.4.1) set(ncnn_DIR ${CMAKE_SOURCE_DIR}/ncnn-20210525-android-vulkan/${ANDROID_ABI}/lib/cmake/ncnn) find_package(ncnn REQUIRED) add_library(yoloXncnn SHARED yoloXncnn_jni.cpp) target_link_libraries(yoloXncnn ncnn jnigraphics ) ================================================ FILE: detector/YOLOX/demo/ncnn/android/app/src/main/jni/yoloXncnn_jni.cpp ================================================ // Some code in this file is based on: // 
https://github.com/nihui/ncnn-android-yolov5/blob/master/app/src/main/jni/yolov5ncnn_jni.cpp // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. // Copyright (C) Megvii, Inc. and its affiliates. All rights reserved. #include #include #include #include #include #include // ncnn #include "layer.h" #include "net.h" #include "benchmark.h" static ncnn::UnlockedPoolAllocator g_blob_pool_allocator; static ncnn::PoolAllocator g_workspace_pool_allocator; static ncnn::Net yoloX; class YoloV5Focus : public ncnn::Layer { public: YoloV5Focus() { one_blob_only = true; } virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, const ncnn::Option& opt) const { int w = bottom_blob.w; int h = bottom_blob.h; int channels = bottom_blob.c; int outw = w / 2; int outh = h / 2; int outc = channels * 4; top_blob.create(outw, outh, outc, 4u, 1, opt.blob_allocator); if (top_blob.empty()) return -100; #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < outc; p++) { const float* ptr = bottom_blob.channel(p % channels).row((p / channels) % 2) + ((p / channels) / 2); float* outptr = top_blob.channel(p); for (int i = 0; i < outh; i++) { for (int j = 0; j < outw; j++) { *outptr = *ptr; outptr += 1; ptr += 2; } ptr += w; } } return 0; } }; DEFINE_LAYER_CREATOR(YoloV5Focus) struct Object { float x; float y; float w; float h; int label; float prob; }; struct GridAndStride { int grid0; int grid1; int stride; }; static inline float intersection_area(const Object& a, const Object& b) { if (a.x > b.x + b.w || a.x + a.w < b.x || a.y > b.y + b.h || a.y + a.h < b.y) { // no intersection return 0.f; } float inter_width = std::min(a.x + a.w, b.x + b.w) - std::max(a.x, b.x); float inter_height = std::min(a.y + a.h, b.y + b.h) - std::max(a.y, b.y); return inter_width * inter_height; } static void qsort_descent_inplace(std::vector& faceobjects, int left, int right) { int i = left; int j = right; float p = faceobjects[(left + right) / 2].prob; while 
(i <= j) { while (faceobjects[i].prob > p) i++; while (faceobjects[j].prob < p) j--; if (i <= j) { // swap std::swap(faceobjects[i], faceobjects[j]); i++; j--; } } #pragma omp parallel sections { #pragma omp section { if (left < j) qsort_descent_inplace(faceobjects, left, j); } #pragma omp section { if (i < right) qsort_descent_inplace(faceobjects, i, right); } } } static void qsort_descent_inplace(std::vector& faceobjects) { if (faceobjects.empty()) return; qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1); } static void nms_sorted_bboxes(const std::vector& faceobjects, std::vector& picked, float nms_threshold) { picked.clear(); const int n = faceobjects.size(); std::vector areas(n); for (int i = 0; i < n; i++) { areas[i] = faceobjects[i].w * faceobjects[i].h; } for (int i = 0; i < n; i++) { const Object& a = faceobjects[i]; int keep = 1; for (int j = 0; j < (int)picked.size(); j++) { const Object& b = faceobjects[picked[j]]; // intersection over union float inter_area = intersection_area(a, b); float union_area = areas[i] + areas[picked[j]] - inter_area; // float IoU = inter_area / union_area if (inter_area / union_area > nms_threshold) keep = 0; } if (keep) picked.push_back(i); } } static void generate_grids_and_stride(const int target_size, std::vector& strides, std::vector& grid_strides) { for (auto stride : strides) { int num_grid = target_size / stride; for (int g1 = 0; g1 < num_grid; g1++) { for (int g0 = 0; g0 < num_grid; g0++) { grid_strides.push_back((GridAndStride){g0, g1, stride}); } } } } static void generate_yolox_proposals(std::vector grid_strides, const ncnn::Mat& feat_blob, float prob_threshold, std::vector& objects) { const int num_grid = feat_blob.h; fprintf(stderr, "output height: %d, width: %d, channels: %d, dims:%d\n", feat_blob.h, feat_blob.w, feat_blob.c, feat_blob.dims); const int num_class = feat_blob.w - 5; const int num_anchors = grid_strides.size(); const float* feat_ptr = feat_blob.channel(0); for (int anchor_idx = 0; 
anchor_idx < num_anchors; anchor_idx++) { const int grid0 = grid_strides[anchor_idx].grid0; const int grid1 = grid_strides[anchor_idx].grid1; const int stride = grid_strides[anchor_idx].stride; // yolox/models/yolo_head.py decode logic // outputs[..., :2] = (outputs[..., :2] + grids) * strides // outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides float x_center = (feat_ptr[0] + grid0) * stride; float y_center = (feat_ptr[1] + grid1) * stride; float w = exp(feat_ptr[2]) * stride; float h = exp(feat_ptr[3]) * stride; float x0 = x_center - w * 0.5f; float y0 = y_center - h * 0.5f; float box_objectness = feat_ptr[4]; for (int class_idx = 0; class_idx < num_class; class_idx++) { float box_cls_score = feat_ptr[5 + class_idx]; float box_prob = box_objectness * box_cls_score; if (box_prob > prob_threshold) { Object obj; obj.x = x0; obj.y = y0; obj.w = w; obj.h = h; obj.label = class_idx; obj.prob = box_prob; objects.push_back(obj); } } // class loop feat_ptr += feat_blob.w; } // point anchor loop } extern "C" { // FIXME DeleteGlobalRef is missing for objCls static jclass objCls = NULL; static jmethodID constructortorId; static jfieldID xId; static jfieldID yId; static jfieldID wId; static jfieldID hId; static jfieldID labelId; static jfieldID probId; JNIEXPORT jint JNI_OnLoad(JavaVM* vm, void* reserved) { __android_log_print(ANDROID_LOG_DEBUG, "YOLOXncnn", "JNI_OnLoad"); ncnn::create_gpu_instance(); return JNI_VERSION_1_4; } JNIEXPORT void JNI_OnUnload(JavaVM* vm, void* reserved) { __android_log_print(ANDROID_LOG_DEBUG, "YOLOXncnn", "JNI_OnUnload"); ncnn::destroy_gpu_instance(); } // public native boolean Init(AssetManager mgr); JNIEXPORT jboolean JNICALL Java_com_megvii_yoloXncnn_YOLOXncnn_Init(JNIEnv* env, jobject thiz, jobject assetManager) { ncnn::Option opt; opt.lightmode = true; opt.num_threads = 4; opt.blob_allocator = &g_blob_pool_allocator; opt.workspace_allocator = &g_workspace_pool_allocator; opt.use_packing_layout = true; // use vulkan compute if 
(ncnn::get_gpu_count() != 0) opt.use_vulkan_compute = true; AAssetManager* mgr = AAssetManager_fromJava(env, assetManager); yoloX.opt = opt; yoloX.register_custom_layer("YoloV5Focus", YoloV5Focus_layer_creator); // init param { int ret = yoloX.load_param(mgr, "yolox.param"); if (ret != 0) { __android_log_print(ANDROID_LOG_DEBUG, "YOLOXncnn", "load_param failed"); return JNI_FALSE; } } // init bin { int ret = yoloX.load_model(mgr, "yolox.bin"); if (ret != 0) { __android_log_print(ANDROID_LOG_DEBUG, "YOLOXncnn", "load_model failed"); return JNI_FALSE; } } // init jni glue jclass localObjCls = env->FindClass("com/megvii/yoloXncnn/YOLOXncnn$Obj"); objCls = reinterpret_cast(env->NewGlobalRef(localObjCls)); constructortorId = env->GetMethodID(objCls, "", "(Lcom/megvii/yoloXncnn/YOLOXncnn;)V"); xId = env->GetFieldID(objCls, "x", "F"); yId = env->GetFieldID(objCls, "y", "F"); wId = env->GetFieldID(objCls, "w", "F"); hId = env->GetFieldID(objCls, "h", "F"); labelId = env->GetFieldID(objCls, "label", "Ljava/lang/String;"); probId = env->GetFieldID(objCls, "prob", "F"); return JNI_TRUE; } // public native Obj[] Detect(Bitmap bitmap, boolean use_gpu); JNIEXPORT jobjectArray JNICALL Java_com_megvii_yoloXncnn_YOLOXncnn_Detect(JNIEnv* env, jobject thiz, jobject bitmap, jboolean use_gpu) { if (use_gpu == JNI_TRUE && ncnn::get_gpu_count() == 0) { return NULL; //return env->NewStringUTF("no vulkan capable gpu"); } double start_time = ncnn::get_current_time(); AndroidBitmapInfo info; AndroidBitmap_getInfo(env, bitmap, &info); const int width = info.width; const int height = info.height; if (info.format != ANDROID_BITMAP_FORMAT_RGBA_8888) return NULL; // parameters which might change for different model const int target_size = 640; const float prob_threshold = 0.3f; const float nms_threshold = 0.65f; std::vector strides = {8, 16, 32}; // might have stride=64 // python 0-1 input tensor with rgb_means = (0.485, 0.456, 0.406), std = (0.229, 0.224, 0.225) // so for 0-255 input image, 
rgb_mean should multiply 255 and norm should div by std. const float mean_vals[3] = {255.f * 0.485f, 255.f * 0.456, 255.f * 0.406f}; const float norm_vals[3] = {1 / (255.f * 0.229f), 1 / (255.f * 0.224f), 1 / (255.f * 0.225f)}; int w = width; int h = height; float scale = 1.f; if (w > h) { scale = (float)target_size / w; w = target_size; h = h * scale; } else { scale = (float)target_size / h; h = target_size; w = w * scale; } ncnn::Mat in = ncnn::Mat::from_android_bitmap_resize(env, bitmap, ncnn::Mat::PIXEL_RGB, w, h); // pad to target_size rectangle int wpad = target_size - w; int hpad = target_size - h; ncnn::Mat in_pad; // different from yolov5, yolox only pad on bottom and right side, // which means users don't need to extra padding info to decode boxes coordinate. ncnn::copy_make_border(in, in_pad, 0, hpad, 0, wpad, ncnn::BORDER_CONSTANT, 114.f); // yolox std::vector objects; { in_pad.substract_mean_normalize(mean_vals, norm_vals); ncnn::Extractor ex = yoloX.create_extractor(); ex.set_vulkan_compute(use_gpu); ex.input("images", in_pad); std::vector proposals; // yolox decode and generate proposal logic { ncnn::Mat out; ex.extract("output", out); std::vector grid_strides; generate_grids_and_stride(target_size, strides, grid_strides); generate_yolox_proposals(grid_strides, out, prob_threshold, proposals); } // sort all proposals by score from highest to lowest qsort_descent_inplace(proposals); // apply nms with nms_threshold std::vector picked; nms_sorted_bboxes(proposals, picked, nms_threshold); int count = picked.size(); objects.resize(count); for (int i = 0; i < count; i++) { objects[i] = proposals[picked[i]]; // adjust offset to original unpadded float x0 = (objects[i].x) / scale; float y0 = (objects[i].y) / scale; float x1 = (objects[i].x + objects[i].w) / scale; float y1 = (objects[i].y + objects[i].h) / scale; // clip x0 = std::max(std::min(x0, (float)(width - 1)), 0.f); y0 = std::max(std::min(y0, (float)(height - 1)), 0.f); x1 = std::max(std::min(x1, 
(float)(width - 1)), 0.f); y1 = std::max(std::min(y1, (float)(height - 1)), 0.f); objects[i].x = x0; objects[i].y = y0; objects[i].w = x1 - x0; objects[i].h = y1 - y0; } } // objects to Obj[] static const char* class_names[] = { "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush" }; jobjectArray jObjArray = env->NewObjectArray(objects.size(), objCls, NULL); for (size_t i=0; iNewObject(objCls, constructortorId, thiz); env->SetFloatField(jObj, xId, objects[i].x); env->SetFloatField(jObj, yId, objects[i].y); env->SetFloatField(jObj, wId, objects[i].w); env->SetFloatField(jObj, hId, objects[i].h); env->SetObjectField(jObj, labelId, env->NewStringUTF(class_names[objects[i].label])); env->SetFloatField(jObj, probId, objects[i].prob); env->SetObjectArrayElement(jObjArray, i, jObj); } double elasped = ncnn::get_current_time() - start_time; __android_log_print(ANDROID_LOG_DEBUG, "YOLOXncnn", "%.2fms detect", elasped); return jObjArray; } } ================================================ FILE: detector/YOLOX/demo/ncnn/android/app/src/main/res/layout/main.xml ================================================