Repository: OlafenwaMoses/ImageAI Branch: master Commit: 2156d1a39a19 Files: 152 Total size: 1.1 MB Directory structure: gitextract_hhet6k5q/ ├── .codecov.yml ├── .github/ │ ├── FUNDING.yml │ └── workflows/ │ └── build.yml ├── .gitignore ├── .travis.yml ├── BACKEND_MIGRATION.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── examples/ │ ├── camera_feed_detection.py │ ├── custom_detection.py │ ├── custom_detection_array_input_output.py │ ├── custom_detection_extract_objects.py │ ├── custom_detection_from_array_extract_objects_array.py │ ├── custom_detection_from_file_extract_objects_array.py │ ├── custom_detection_train.py │ ├── custom_detection_video.py │ ├── custom_model_prediction.py │ ├── custom_model_training.py │ ├── image_custom_object_detection.py │ ├── image_prediction.py │ ├── object_detection.py │ ├── video_analysis_per_frame.py │ ├── video_analysis_per_second.py │ ├── video_custom_object_detection.py │ └── video_object_detection.py ├── imageai/ │ ├── Classification/ │ │ ├── CUSTOMCLASSIFICATION.md │ │ ├── CUSTOMTRAINING.md │ │ ├── Custom/ │ │ │ ├── __init__.py │ │ │ ├── data_transformation.py │ │ │ └── training_params.py │ │ ├── README.md │ │ ├── __init__.py │ │ └── imagenet_classes.txt │ ├── Detection/ │ │ ├── Custom/ │ │ │ ├── CUSTOMDETECTION.md │ │ │ ├── CUSTOMDETECTIONTRAINING.md │ │ │ ├── CUSTOMVIDEODETECTION.md │ │ │ ├── __init__.py │ │ │ └── yolo/ │ │ │ ├── __init__.py │ │ │ ├── compute_loss.py │ │ │ ├── custom_anchors.py │ │ │ ├── dataset.py │ │ │ ├── metric.py │ │ │ └── validate.py │ │ ├── README.md │ │ ├── VIDEO.md │ │ ├── __init__.py │ │ ├── coco91_classes.txt │ │ └── coco_classes.txt │ ├── __init__.py │ ├── backend_check/ │ │ ├── __init__.py │ │ ├── backend_check.py │ │ └── model_extension.py │ ├── densenet121/ │ │ └── __init__.py │ ├── inceptionv3/ │ │ └── __init__.py │ ├── mobilenetv2/ │ │ └── __init__.py │ ├── resnet50/ │ │ └── __init__.py │ ├── retinanet/ │ │ ├── __init__.py │ │ └── utils.py │ └── yolov3/ │ ├── __init__.py │ ├── 
tiny_yolov3.py │ ├── utils.py │ └── yolov3.py ├── imageai_tf_deprecated/ │ ├── Classification/ │ │ ├── CUSTOMCLASSIFICATION.md │ │ ├── CUSTOMTRAINING.md │ │ ├── Custom/ │ │ │ └── __init__.py │ │ ├── README.md │ │ └── __init__.py │ ├── Detection/ │ │ ├── Custom/ │ │ │ ├── CUSTOMDETECTION.md │ │ │ ├── CUSTOMDETECTIONTRAINING.md │ │ │ ├── CUSTOMVIDEODETECTION.md │ │ │ ├── __init__.py │ │ │ ├── callbacks.py │ │ │ ├── evaluate.py │ │ │ ├── gen_anchors.py │ │ │ ├── generator.py │ │ │ ├── utils/ │ │ │ │ ├── __init__.py │ │ │ │ ├── bbox.py │ │ │ │ ├── colors.py │ │ │ │ ├── image.py │ │ │ │ ├── multi_gpu_model.py │ │ │ │ └── utils.py │ │ │ └── voc.py │ │ ├── README.md │ │ ├── VIDEO.md │ │ ├── YOLO/ │ │ │ ├── __init__.py │ │ │ ├── utils.py │ │ │ └── yolov3.py │ │ ├── __init__.py │ │ └── keras_retinanet/ │ │ ├── __init__.py │ │ ├── backend/ │ │ │ ├── __init__.py │ │ │ └── backend.py │ │ ├── bin/ │ │ │ ├── __init__.py │ │ │ ├── convert_model.py │ │ │ ├── debug.py │ │ │ ├── evaluate.py │ │ │ └── train.py │ │ ├── callbacks/ │ │ │ ├── __init__.py │ │ │ ├── coco.py │ │ │ ├── common.py │ │ │ └── eval.py │ │ ├── initializers.py │ │ ├── layers/ │ │ │ ├── __init__.py │ │ │ ├── _misc.py │ │ │ └── filter_detections.py │ │ ├── losses.py │ │ ├── models/ │ │ │ ├── __init__.py │ │ │ ├── densenet.py │ │ │ ├── effnet.py │ │ │ ├── mobilenet.py │ │ │ ├── resnet.py │ │ │ ├── retinanet.py │ │ │ ├── senet.py │ │ │ └── vgg.py │ │ ├── preprocessing/ │ │ │ ├── __init__.py │ │ │ ├── coco.py │ │ │ ├── csv_generator.py │ │ │ ├── generator.py │ │ │ ├── kitti.py │ │ │ ├── open_images.py │ │ │ └── pascal_voc.py │ │ └── utils/ │ │ ├── __init__.py │ │ ├── anchors.py │ │ ├── coco_eval.py │ │ ├── colors.py │ │ ├── compute_overlap.pyx │ │ ├── config.py │ │ ├── eval.py │ │ ├── gpu.py │ │ ├── image.py │ │ ├── model.py │ │ ├── tf_version.py │ │ ├── transform.py │ │ └── visualization.py │ ├── Prediction/ │ │ ├── Custom/ │ │ │ ├── __init__.py │ │ │ └── custom_utils.py │ │ ├── __init__.py │ │ └── imagenet_utils.py │ 
└── __init__.py ├── requirements.txt ├── requirements_extra.txt ├── requirements_gpu.txt ├── scripts/ │ └── pascal_voc_to_yolo.py ├── setup.py └── test/ ├── test_custom_classification.py ├── test_custom_classification_training.py ├── test_custom_detection_training.py ├── test_custom_object_detection.py ├── test_custom_video_detection.py ├── test_image_classification.py ├── test_object_detection.py └── test_video_object_detection.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .codecov.yml ================================================ codecov: notify: require_ci_to_pass: yes coverage: precision: 2 round: down range: "30...100" status: project: yes patch: yes changes: no parsers: gcov: branch_detection: conditional: yes loop: yes method: no macro: no comment: layout: "header, diff" behavior: default require_changes: no ================================================ FILE: .github/FUNDING.yml ================================================ github: OlafenwaMoses ================================================ FILE: .github/workflows/build.yml ================================================ name: Build and Testing on: push: branches: [master] pull_request: branches: [master] jobs: UnitestPython37: name: Python3.7 Tests runs-on: ubuntu-latest # needs: None steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: python-version: '3.7' cache: 'pip' - name: Install Dependencies run: | pip install -r requirements.txt pip install -r requirements_extra.txt - name: Download and Setup Resources env: CI: false run: | sudo apt-get update sudo apt-get install unzip -y mkdir test/data-models mkdir test/data-json wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/densenet121-a639ec97.pth -P test/data-models wget 
https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/densenet121-idenprof-test_acc_0.82550_epoch-95.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/inception_v3-idenprof-test_acc_0.81050_epoch-92.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/inception_v3_google-1a9a5a14.pth -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/mobilenet_v2-b0353104.pth -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/mobilenet_v2-idenprof-test_acc_0.85300_epoch-92.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/resnet50-19c8e357.pth -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/resnet50-idenprof-test_acc_0.78200_epoch-91.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/retinanet_resnet50_fpn_coco-eeacb38b.pth -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/tiny-yolov3.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/tiny_yolov3_number-plate-dataset-imageai_mAP-0.22595_epoch-20.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/yolov3.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/yolov3_number-plate-dataset-imageai_mAP-0.57145_epoch-11.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/idenprof.json -P test/data-json wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/number-plate-dataset-imageai_tiny_yolov3_detection_config.json -P test/data-json wget 
https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/number-plate-dataset-imageai_yolov3_detection_config.json -P test/data-json wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/idenprof_model_classes.json -P test/data-json wget https://github.com/OlafenwaMoses/ImageAI/releases/download/test-resources-v3/data-datasets.zip -P test wget https://github.com/OlafenwaMoses/ImageAI/releases/download/test-resources-v3/data-images.zip -P test wget https://github.com/OlafenwaMoses/ImageAI/releases/download/test-resources-v3/data-videos.zip -P test unzip test/data-datasets.zip -d test unzip test/data-images.zip -d test unzip test/data-videos.zip -d test - name: Run Unittest run: | pytest test -vvv UnitestPython38: name: Python3.8 Tests runs-on: ubuntu-latest # needs: None steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: python-version: '3.8' cache: 'pip' - name: Install Dependencies run: | pip install -r requirements.txt pip install -r requirements_extra.txt - name: Download and Setup Resources env: CI: false run: | sudo apt-get update sudo apt-get install unzip -y mkdir test/data-models mkdir test/data-json wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/densenet121-a639ec97.pth -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/densenet121-idenprof-test_acc_0.82550_epoch-95.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/inception_v3-idenprof-test_acc_0.81050_epoch-92.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/inception_v3_google-1a9a5a14.pth -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/mobilenet_v2-b0353104.pth -P test/data-models wget 
https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/mobilenet_v2-idenprof-test_acc_0.85300_epoch-92.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/resnet50-19c8e357.pth -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/resnet50-idenprof-test_acc_0.78200_epoch-91.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/retinanet_resnet50_fpn_coco-eeacb38b.pth -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/tiny-yolov3.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/tiny_yolov3_number-plate-dataset-imageai_mAP-0.22595_epoch-20.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/yolov3.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/yolov3_number-plate-dataset-imageai_mAP-0.57145_epoch-11.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/idenprof.json -P test/data-json wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/number-plate-dataset-imageai_tiny_yolov3_detection_config.json -P test/data-json wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/number-plate-dataset-imageai_yolov3_detection_config.json -P test/data-json wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/idenprof_model_classes.json -P test/data-json wget https://github.com/OlafenwaMoses/ImageAI/releases/download/test-resources-v3/data-datasets.zip -P test wget https://github.com/OlafenwaMoses/ImageAI/releases/download/test-resources-v3/data-images.zip -P test wget https://github.com/OlafenwaMoses/ImageAI/releases/download/test-resources-v3/data-videos.zip -P test unzip 
test/data-datasets.zip -d test unzip test/data-images.zip -d test unzip test/data-videos.zip -d test - name: Run Unittest run: | pytest test -vvv UnitestPython39: name: Python3.9 Tests runs-on: ubuntu-latest # needs: None steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: python-version: '3.9' cache: 'pip' - name: Install Dependencies run: | pip install -r requirements.txt pip install -r requirements_extra.txt - name: Download and Setup Resources env: CI: false run: | sudo apt-get update sudo apt-get install unzip -y mkdir test/data-models mkdir test/data-json wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/densenet121-a639ec97.pth -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/densenet121-idenprof-test_acc_0.82550_epoch-95.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/inception_v3-idenprof-test_acc_0.81050_epoch-92.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/inception_v3_google-1a9a5a14.pth -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/mobilenet_v2-b0353104.pth -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/mobilenet_v2-idenprof-test_acc_0.85300_epoch-92.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/resnet50-19c8e357.pth -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/resnet50-idenprof-test_acc_0.78200_epoch-91.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/retinanet_resnet50_fpn_coco-eeacb38b.pth -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/tiny-yolov3.pt -P test/data-models wget 
https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/tiny_yolov3_number-plate-dataset-imageai_mAP-0.22595_epoch-20.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/yolov3.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/yolov3_number-plate-dataset-imageai_mAP-0.57145_epoch-11.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/idenprof.json -P test/data-json wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/number-plate-dataset-imageai_tiny_yolov3_detection_config.json -P test/data-json wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/number-plate-dataset-imageai_yolov3_detection_config.json -P test/data-json wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/idenprof_model_classes.json -P test/data-json wget https://github.com/OlafenwaMoses/ImageAI/releases/download/test-resources-v3/data-datasets.zip -P test wget https://github.com/OlafenwaMoses/ImageAI/releases/download/test-resources-v3/data-images.zip -P test wget https://github.com/OlafenwaMoses/ImageAI/releases/download/test-resources-v3/data-videos.zip -P test unzip test/data-datasets.zip -d test unzip test/data-images.zip -d test unzip test/data-videos.zip -d test - name: Run Unittest run: | pytest test -vvv UnitestPython310: name: Python3.10 Tests runs-on: ubuntu-latest # needs: None steps: - uses: actions/checkout@v3 - uses: actions/setup-python@v4 with: python-version: '3.10' cache: 'pip' - name: Install Dependencies run: | pip install -r requirements.txt pip install -r requirements_extra.txt - name: Download and Setup Resources env: CI: false run: | sudo apt-get update sudo apt-get install unzip -y mkdir test/data-models mkdir test/data-json wget 
https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/densenet121-a639ec97.pth -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/densenet121-idenprof-test_acc_0.82550_epoch-95.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/inception_v3-idenprof-test_acc_0.81050_epoch-92.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/inception_v3_google-1a9a5a14.pth -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/mobilenet_v2-b0353104.pth -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/mobilenet_v2-idenprof-test_acc_0.85300_epoch-92.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/resnet50-19c8e357.pth -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/resnet50-idenprof-test_acc_0.78200_epoch-91.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/retinanet_resnet50_fpn_coco-eeacb38b.pth -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/tiny-yolov3.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/tiny_yolov3_number-plate-dataset-imageai_mAP-0.22595_epoch-20.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/yolov3.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/yolov3_number-plate-dataset-imageai_mAP-0.57145_epoch-11.pt -P test/data-models wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/idenprof.json -P test/data-json wget 
https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/number-plate-dataset-imageai_tiny_yolov3_detection_config.json -P test/data-json wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/number-plate-dataset-imageai_yolov3_detection_config.json -P test/data-json wget https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/idenprof_model_classes.json -P test/data-json wget https://github.com/OlafenwaMoses/ImageAI/releases/download/test-resources-v3/data-datasets.zip -P test wget https://github.com/OlafenwaMoses/ImageAI/releases/download/test-resources-v3/data-images.zip -P test wget https://github.com/OlafenwaMoses/ImageAI/releases/download/test-resources-v3/data-videos.zip -P test unzip test/data-datasets.zip -d test unzip test/data-images.zip -d test unzip test/data-videos.zip -d test - name: Run Unittest run: | pytest test -vvv ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ cover/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder .pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: # .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control #poetry.lock # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. #pdm.lock # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it # in version control. # https://pdm.fming.dev/#use-with-ide .pdm.toml # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ # pytype static type analyzer .pytype/ # Cython debug symbols cython_debug/ # PyCharm # JetBrains specific template is maintained in a separate JetBrains.gitignore that can # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ # Other files and folders test/data-models test/data-images test/data-json test/data-videos test/data-datasets experiment ================================================ FILE: .travis.yml ================================================ dist: xenial sudo: required language: python python: - '3.7.6' install: - pip install -r requirements.txt - pip install pytest - pip install pytest-cov script: - python setup.py install - cd test - mkdir data-models - mkdir data-temp - wget -P data-models/ https://github.com/OlafenwaMoses/ImageAI/releases/download/1.0/DenseNet-BC-121-32.h5 - wget -P data-models/ https://github.com/OlafenwaMoses/ImageAI/releases/download/1.0/inception_v3_weights_tf_dim_ordering_tf_kernels.h5 - wget -P data-models/ https://github.com/OlafenwaMoses/ImageAI/releases/download/essentials-v5/resnet50_imagenet_tf.2.0.h5 - wget -P data-models/ https://github.com/OlafenwaMoses/ImageAI/releases/download/essentials-v5/mobilenet_v2.h5 - wget -P data-models/ https://github.com/OlafenwaMoses/ImageAI/releases/download/models-v3/idenprof_densenet-0.763500.h5 - wget -P data-models/ 
https://github.com/OlafenwaMoses/ImageAI/releases/download/models-v3/idenprof_full_resnet_ex-001_acc-0.119792.h5 - wget -P data-models/ https://github.com/OlafenwaMoses/ImageAI/releases/download/essentials-v5/idenprof_resnet_ex-056_acc-0.993062.h5 - wget -P data-models/ https://github.com/OlafenwaMoses/ImageAI/releases/download/essentials-v5/resnet50_coco_best_v2.1.0.h5 - wget -P data-models/ https://github.com/OlafenwaMoses/ImageAI/releases/download/1.0/yolo.h5 - wget -P data-models/ https://github.com/OlafenwaMoses/ImageAI/releases/download/1.0/yolo-tiny.h5 - wget -P data-models/ https://github.com/OlafenwaMoses/ImageAI/releases/download/essential-v4/pretrained-yolov3.h5 - wget -P data-models/ https://github.com/OlafenwaMoses/ImageAI/releases/download/essential-v4/hololens-ex-60--loss-2.76.h5 - pytest -v --cov after_script: - bash <(curl -s https://codecov.io/bash) ================================================ FILE: BACKEND_MIGRATION.md ================================================ # Overview In December 2022, ImageAI `3.0.2` was released which effected the change from Tensorflow backend to PyTorch backend. This change allows ImageAI to support `Python 3.7` up to `Python 3.10` for all its features and deprecates a number of functionalities for this and future versions of ImageAI. # Deprecated functionalities - Tensorflow backend no longer supported. Now replaced with PyTorch - All `.h5` pretrained models and custom trained `.h5` models no longer supported. If you still intend to use these models, see the `Using Tensorflow backend` section. - `Speed mode` have been removed from model loading - Custom detection model training dataset format changed to YOLO format from Pascal VOC. To convert your dataset to YOLO format, see the `Convert Pascal VOC dataset to YOLO format` section. 
- Enhance data for custom classification model training now removed - Detection model training standalone evaluation now removed # Using Tensorflow backend To use Tensorflow backend, do the following - Install Python 3.7 - Install Tensorflow - CPU: `pip install tensorflow==2.4.0` - GPU: `pip install tensorflow-gpu==2.4.0` - Install other dependencies: `pip install keras==2.4.3 numpy==1.19.3 pillow==7.0.0 scipy==1.4.1 h5py==2.10.0 matplotlib==3.3.2 opencv-python keras-resnet==0.2.0` - Install ImageAI **2.1.6**: `pip install imageai==2.1.6` - Download the Tensorflow models from the releases below - [Models for Image Recognition and Object Detection](https://github.com/OlafenwaMoses/ImageAI/releases/tag/1.0) - [TF2.x Models [ Exclusives ]](https://github.com/OlafenwaMoses/ImageAI/releases/tag/essentials-v5) # Convert Pascal VOC dataset to YOLO format Because ImageAI now uses `YOLO format` for training custom object detection models; should you need to train a new model with the new ImageAI version, you will need to convert your `Pascal VOC` datasets to YOLO format by doing the following - Run the command below ``` python scripts/pascal_voc_to_yolo.py --dataset_dir ``` - Once completed, you will find the YOLO version of the dataset next to your Pascal VOC dataset. 
- E.g, if your dataset is in `C:/Users/Troublemaker/Documents/datasets/headset`, your conversion command will be ``` python scripts/pascal_voc_to_yolo.py --dataset_dir C:/Users/Troublemaker/Documents/datasets/headset ``` and once completed, the output will be in `C:/Users/Troublemaker/Documents/datasets/headset-yolo` ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2019 MOSES OLAFENWA Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: MANIFEST.in ================================================ recursive-include imageai/Detection *.txt recursive-include imageai/Classification *.txt ================================================ FILE: README.md ================================================ # ImageAI (v3.0.3) [![Build Status](https://travis-ci.com/OlafenwaMoses/ImageAI.svg?branch=master)](https://travis-ci.com/OlafenwaMoses/ImageAI) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/OlafenwaMoses/ImageAI/blob/master/LICENSE) [![PyPI version](https://badge.fury.io/py/imageai.svg)](https://badge.fury.io/py/imageai) [![Downloads](https://pepy.tech/badge/imageai/month)](https://pepy.tech/project/imageai) [![Downloads](https://pepy.tech/badge/imageai/week)](https://pepy.tech/project/imageai) An open-source python library built to empower developers to build applications and systems with self-contained Deep Learning and Computer Vision capabilities using a few simple lines of code. If you would like to sponsor this project, kindly visit the [Github sponsor page](https://github.com/sponsors/OlafenwaMoses). ## --------------------------------------------------- ## Introducing Jarvis and TheiaEngine. We the creators of ImageAI are glad to announce 2 new AI projects to provide state-of-the-art Generative AI, LLM and Image Understanding on your personal computer and servers. [![](jarvis.png)](https://jarvis.genxr.co) Install Jarvis on PC/Mac to set up limitless access to LLM powered AI Chats for your everyday work, research and generative AI needs with 100% privacy and full offline capability. Visit [https://jarvis.genxr.co](https://jarvis.genxr.co/) to get started. 
[![](theiaengine.png)](https://www.genxr.co/theia-engine) [TheiaEngine](https://www.genxr.co/theia-engine), the next-generation computer Vision AI API capable of all Generative and Understanding computer vision tasks in a single API call and available via REST API to all programming languages. Features include - **Detect 300+ objects** (220 more objects than ImageAI) - **Provide answers to any content or context questions** asked on an image - very useful to get information on any object, action or information without needing to train a new custom model for every task - **Generate scene description and summary** - **Convert 2D image to 3D pointcloud and triangular mesh** - **Semantic Scene mapping of objects, walls, floors, etc** - **Stateless Face recognition and emotion detection** - **Image generation and augmentation from prompt** - etc. Visit [https://www.genxr.co/theia-engine](https://www.genxr.co/theia-engine) to try the demo and join in the beta testing today. ## --------------------------------------------------- ![](logo1.png) Developed and maintained by [Moses Olafenwa](https://twitter.com/OlafenwaMoses) --- Built with simplicity in mind, **ImageAI** supports a list of state-of-the-art Machine Learning algorithms for image prediction, custom image prediction, object detection, video detection, video object tracking and image prediction training. **ImageAI** currently supports image prediction and training using 4 different Machine Learning algorithms trained on the ImageNet-1000 dataset. **ImageAI** also supports object detection, video detection and object tracking using RetinaNet, YOLOv3 and TinyYOLOv3 trained on the COCO dataset. Finally, **ImageAI** allows you to train custom models for performing detection and recognition of new objects. 
Eventually, **ImageAI** will provide support for wider and more specialized aspects of Computer Vision. **New Release : ImageAI 3.0.2** What's new: - PyTorch backend - TinyYOLOv3 model training ### TABLE OF CONTENTS - :white_square_button: Installation - :white_square_button: Features - :white_square_button: Documentation - :white_square_button: Sponsors - :white_square_button: Projects Built on ImageAI - :white_square_button: High Performance Implementation - :white_square_button: AI Practice Recommendations - :white_square_button: Contact Developers - :white_square_button: Citation - :white_square_button: References ## Installation
To install ImageAI, run the python installation instruction below in the command line: - [Download and Install](https://www.python.org/downloads/) **Python 3.7**, **Python 3.8**, **Python 3.9** or **Python 3.10** - Install dependencies - **CPU**: Download [requirements.txt](https://github.com/OlafenwaMoses/ImageAI/blob/master/requirements.txt) file and install via the command ``` pip install -r requirements.txt ``` or simply copy and run the command below ``` pip install cython pillow>=7.0.0 numpy>=1.18.1 opencv-python>=4.1.2 torch>=1.9.0 --extra-index-url https://download.pytorch.org/whl/cpu torchvision>=0.10.0 --extra-index-url https://download.pytorch.org/whl/cpu pytest==7.1.3 tqdm==4.64.1 scipy>=1.7.3 matplotlib>=3.4.3 mock==4.0.3 ``` - **GPU/CUDA**: Download [requirements_gpu.txt](https://github.com/OlafenwaMoses/ImageAI/blob/master/requirements_gpu.txt) file and install via the command ``` pip install -r requirements_gpu.txt ``` or simply copy and run the command below ``` pip install cython pillow>=7.0.0 numpy>=1.18.1 opencv-python>=4.1.2 torch>=1.9.0 --extra-index-url https://download.pytorch.org/whl/cu102 torchvision>=0.10.0 --extra-index-url https://download.pytorch.org/whl/cu102 pytest==7.1.3 tqdm==4.64.1 scipy>=1.7.3 matplotlib>=3.4.3 mock==4.0.3 ``` - If you plan to train custom AI models, download [requirements_extra.txt](https://github.com/OlafenwaMoses/ImageAI/blob/master/requirements_extra.txt) file and install via the command ``` pip install -r requirements_extra.txt ``` or simply copy and run the command below ``` pip install pycocotools@git+https://github.com/gautamchitnis/cocoapi.git@cocodataset-master#subdirectory=PythonAPI ``` - Then run the command below to install ImageAI ``` pip install imageai --upgrade ``` ## Features

Image Classification

ImageAI provides 4 different algorithms and model types to perform image prediction, trained on the ImageNet-1000 dataset. The 4 algorithms provided for image prediction include MobileNetV2, ResNet50, InceptionV3 and DenseNet121. Click the link below to see the full sample codes, explanations and best practices guide.

>>> Get Started

Object Detection

ImageAI provides very convenient and powerful methods to perform object detection on images and extract each object from the image. The object detection class provides support for RetinaNet, YOLOv3 and TinyYOLOv3, with options to adjust for state of the art performance or real time processing. Click the link below to see the full sample codes, explanations and best practices guide.

>>> Get Started

Video Object Detection & Analysis

ImageAI provides very convenient and powerful methods to perform object detection in videos. The video object detection class provided only supports the current state-of-the-art RetinaNet. Click the link to see the full videos, sample codes, explanations and best practices guide.

>>> Get Started

Custom Classification model training

ImageAI provides classes and methods for you to train a new model that can be used to perform prediction on your own custom objects. You can train your custom models using MobileNetV2, ResNet50, InceptionV3 and DenseNet in 5 lines of code. Click the link below to see the guide to preparing training images, sample training codes, explanations and best practices.

>>> Get Started

Custom Model Classification

ImageAI provides classes and methods for you to run image prediction on your own custom objects using your own model trained with the ImageAI Model Training class. You can use your custom models trained with MobileNetV2, ResNet50, InceptionV3 and DenseNet and the JSON file containing the mapping of the custom object names. Click the link below to see the guide to sample training codes, explanations, and best practices guide.

>>> Get Started

Custom Detection Model Training

ImageAI provides classes and methods for you to train new YOLOv3 or TinyYOLOv3 object detection models on your custom dataset. This means you can train a model to detect literally any object of interest by providing the images, the annotations and training with ImageAI. Click the link below to see the guide to sample training codes, explanations, and best practices guide.

>>> Get Started

Custom Object Detection

ImageAI now provides classes and methods for you to detect and recognize your own custom objects in images using your own model trained with the DetectionModelTrainer class. You can use your custom trained YOLOv3 or TinyYOLOv3 model and the **.json** file generated during the training. Click the link below to see the guide to sample training codes, explanations, and best practices guide.

>>> Get Started

Custom Video Object Detection & Analysis

ImageAI now provides classes and methods for you to detect and recognize your own custom objects in videos using your own model trained with the DetectionModelTrainer class. You can use your custom trained YOLOv3 or TinyYOLOv3 model and the **.json** file generated during the training. Click the link below to see the guide to sample training codes, explanations, and best practices guide.

>>> Get Started
## Documentation
We have provided full documentation for all **ImageAI** classes and functions. Visit the link below: - Documentation - **English Version** [https://imageai.readthedocs.io](https://imageai.readthedocs.io) ## Sponsors
## Real-Time and High Performance Implementation
**ImageAI** provides abstracted and convenient implementations of state-of-the-art Computer Vision technologies. All of **ImageAI** implementations and code can work on any computer system with moderate CPU capacity. However, the speed of processing for operations like image prediction, object detection and others on CPU is slow and not suitable for real-time applications. To perform real-time Computer Vision operations with high performance, you need to use GPU enabled technologies. **ImageAI** uses the PyTorch backbone for its Computer Vision operations. PyTorch supports both CPUs and GPUs (specifically NVIDIA GPUs; you can get one for your PC or get a PC that has one) for machine learning and artificial intelligence algorithms' implementations. ## Projects Built on ImageAI
## AI Practice Recommendations
For anyone interested in building AI systems and using them for business, economic, social and research purposes, it is critical that the person knows the likely positive, negative and unprecedented impacts the use of such technologies will have. They must also be aware of approaches and practices recommended by experienced industry experts to ensure every use of AI brings overall benefit to mankind. We therefore recommend to everyone that wishes to use ImageAI and other AI tools and resources to read Microsoft's January 2018 publication on AI titled "The Future Computed : Artificial Intelligence and its role in society". Kindly follow the link below to download the publication. [https://blogs.microsoft.com/blog/2018/01/17/future-computed-artificial-intelligence-role-society](https://blogs.microsoft.com/blog/2018/01/17/future-computed-artificial-intelligence-role-society/) ### Contact Developer
- **Moses Olafenwa** * _Email:_ guymodscientist@gmail.com * _Twitter:_ [@OlafenwaMoses](https://twitter.com/OlafenwaMoses) * _Medium:_ [@guymodscientist](https://medium.com/@guymodscientist) * _Facebook:_ [moses.olafenwa](https://facebook.com/moses.olafenwa) - **John Olafenwa** * _Email:_ johnolafenwa@gmail.com * _Website:_ [https://john.aicommons.science](https://john.aicommons.science) * _Twitter:_ [@johnolafenwa](https://twitter.com/johnolafenwa) * _Medium:_ [@johnolafenwa](https://medium.com/@johnolafenwa) * _Facebook:_ [olafenwajohn](https://facebook.com/olafenwajohn) ### Citation
You can cite **ImageAI** in your projects and research papers via the **BibTeX** entry below. ``` @misc {ImageAI, author = "Moses", title = "ImageAI, an open source python library built to empower developers to build applications and systems with self-contained Computer Vision capabilities", url = "https://github.com/OlafenwaMoses/ImageAI", month = "mar", year = "2018--" } ``` ### References
1. Somshubra Majumdar, DenseNet Implementation of the paper, Densely Connected Convolutional Networks in Keras [https://github.com/titu1994/DenseNet](https://github.com/titu1994/DenseNet) 2. Broad Institute of MIT and Harvard, Keras package for deep residual networks [https://github.com/broadinstitute/keras-resnet](https://github.com/broadinstitute/keras-resnet) 3. Fizyr, Keras implementation of RetinaNet object detection [https://github.com/fizyr/keras-retinanet](https://github.com/fizyr/keras-retinanet) 4. Francois Chollet, Keras code and weights files for popular deeplearning models [https://github.com/fchollet/deep-learning-models](https://github.com/fchollet/deep-learning-models) 5. Forrest N. et al, SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <0.5MB model size [https://arxiv.org/abs/1602.07360](https://arxiv.org/abs/1602.07360) 6. Kaiming H. et al, Deep Residual Learning for Image Recognition [https://arxiv.org/abs/1512.03385](https://arxiv.org/abs/1512.03385) 7. Szegedy. et al, Rethinking the Inception Architecture for Computer Vision [https://arxiv.org/abs/1512.00567](https://arxiv.org/abs/1512.00567) 8. Gao. et al, Densely Connected Convolutional Networks [https://arxiv.org/abs/1608.06993](https://arxiv.org/abs/1608.06993) 9. Tsung-Yi. et al, Focal Loss for Dense Object Detection [https://arxiv.org/abs/1708.02002](https://arxiv.org/abs/1708.02002) 10. O Russakovsky et al, ImageNet Large Scale Visual Recognition Challenge [https://arxiv.org/abs/1409.0575](https://arxiv.org/abs/1409.0575) 11. TY Lin et al, Microsoft COCO: Common Objects in Context [https://arxiv.org/abs/1405.0312](https://arxiv.org/abs/1405.0312) 12. Moses & John Olafenwa, A collection of images of identifiable professionals. [https://github.com/OlafenwaMoses/IdenProf](https://github.com/OlafenwaMoses/IdenProf) 13. Joseph Redmon and Ali Farhadi, YOLOv3: An Incremental Improvement. [https://arxiv.org/abs/1804.02767](https://arxiv.org/abs/1804.02767) 14. 
Experiencor, Training and Detecting Objects with YOLO3 [https://github.com/experiencor/keras-yolo3](https://github.com/experiencor/keras-yolo3) 15. MobileNetV2: Inverted Residuals and Linear Bottlenecks [https://arxiv.org/abs/1801.04381](https://arxiv.org/abs/1801.04381) 16. YOLOv3 in PyTorch > ONNX > CoreML > TFLite [https://github.com/ultralytics/yolov3](https://github.com/ultralytics/yolov3) ================================================ FILE: examples/camera_feed_detection.py ================================================ from imageai.Detection import VideoObjectDetection import os import cv2 execution_path = os.getcwd() camera = cv2.VideoCapture(0) detector = VideoObjectDetection() detector.setModelTypeAsYOLOv3() detector.setModelPath(os.path.join(execution_path , "yolov3.pt")) # Download the model via this link https://github.com/OlafenwaMoses/ImageAI/releases/tag/1.0 detector.loadModel() video_path = detector.detectObjectsFromVideo(camera_input=camera, output_file_path=os.path.join(execution_path, "camera_detected_video") , frames_per_second=20, log_progress=True, minimum_percentage_probability=30) print(video_path) ================================================ FILE: examples/custom_detection.py ================================================ from imageai.Detection.Custom import CustomObjectDetection detector = CustomObjectDetection() detector.setModelTypeAsYOLOv3() detector.setModelPath("yolov3_hololens-yolo_mAP-0.82726_epoch-73.pt") # https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/yolov3_hololens-yolo_mAP-0.82726_epoch-73.pt detector.setJsonPath("hololens-yolo_yolov3_detection_config.json") # https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/hololens-yolo_yolov3_detection_config.json detector.loadModel() detections = detector.detectObjectsFromImage(input_image="holo2.jpg", output_image_path="holo2-detected.jpg") for detection in detections: print(detection["name"], " : ", 
detection["percentage_probability"], " : ", detection["box_points"]) """ EXAMPLE RESULT hololens : 39.69653248786926 : [611, 74, 751, 154] hololens : 87.6643180847168 : [23, 46, 90, 79] hololens : 89.25175070762634 : [191, 66, 243, 95] hololens : 64.49641585350037 : [437, 81, 514, 133] hololens : 91.78624749183655 : [380, 113, 423, 138] """ ================================================ FILE: examples/custom_detection_array_input_output.py ================================================ from imageai.Detection.Custom import CustomObjectDetection import cv2 image_array = cv2.imread("holo2.jpg") detector = CustomObjectDetection() detector.setModelTypeAsYOLOv3() detector.setModelPath("yolov3_hololens-yolo_mAP-0.82726_epoch-73.pt") # https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/yolov3_hololens-yolo_mAP-0.82726_epoch-73.pt detector.setJsonPath("hololens-yolo_yolov3_detection_config.json") # https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/hololens-yolo_yolov3_detection_config.json detector.loadModel() detected_image, detections = detector.detectObjectsFromImage(input_image=image_array, input_type="array", output_type="array") for eachObject in detections: print(eachObject["name"], " : ", eachObject["percentage_probability"], " : ", eachObject["box_points"]) cv2.imshow("Main Image", detected_image) cv2.waitKey() cv2.destroyAllWindows() """ SAMPLE RESULT hololens : 39.69653248786926 : [611, 74, 751, 154] hololens : 87.6643180847168 : [23, 46, 90, 79] hololens : 89.25175070762634 : [191, 66, 243, 95] hololens : 64.49641585350037 : [437, 81, 514, 133] hololens : 91.78624749183655 : [380, 113, 423, 138] """ ================================================ FILE: examples/custom_detection_extract_objects.py ================================================ from imageai.Detection.Custom import CustomObjectDetection detector = CustomObjectDetection() detector.setModelTypeAsYOLOv3() 
detector.setModelPath("yolov3_hololens-yolo_mAP-0.82726_epoch-73.pt") # https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/yolov3_hololens-yolo_mAP-0.82726_epoch-73.pt detector.setJsonPath("hololens-yolo_yolov3_detection_config.json") # https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/hololens-yolo_yolov3_detection_config.json detector.loadModel() detections, extracted_objects_array = detector.detectObjectsFromImage(input_image="holo2.jpg", output_image_path="holo2-detected.jpg", extract_detected_objects=True) for detection, object_path in zip(detections, extracted_objects_array): print(object_path) print(detection["name"], " : ", detection["percentage_probability"], " : ", detection["box_points"]) print("---------------") """ SAMPLE RESULT holo2-detected-objects\hololens-1.jpg hololens : 39.69653248786926 : [611, 74, 751, 154] --------------- holo2-detected-objects\hololens-1.jpg hololens : 87.6643180847168 : [23, 46, 90, 79] --------------- holo2-detected-objects\hololens-1.jpg hololens : 89.25175070762634 : [191, 66, 243, 95] --------------- holo2-detected-objects\hololens-1.jpg hololens : 64.49641585350037 : [437, 81, 514, 133] --------------- holo2-detected-objects\hololens-1.jpg hololens : 91.78624749183655 : [380, 113, 423, 138] --------------- """ ================================================ FILE: examples/custom_detection_from_array_extract_objects_array.py ================================================ from imageai.Detection.Custom import CustomObjectDetection import cv2 image_array = cv2.imread("holo2.jpg") detector = CustomObjectDetection() detector.setModelTypeAsYOLOv3() detector.setModelPath("yolov3_hololens-yolo_mAP-0.82726_epoch-73.pt") # https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/yolov3_hololens-yolo_mAP-0.82726_epoch-73.pt detector.setJsonPath("hololens-yolo_yolov3_detection_config.json") # 
https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/hololens-yolo_yolov3_detection_config.json detector.loadModel() detected_image, detections, extracted_objects = detector.detectObjectsFromImage(input_image=image_array, extract_detected_objects=True, input_type="array", output_type="array") for eachObject in detections: print(eachObject["name"], " : ", eachObject["percentage_probability"], " : ", eachObject["box_points"]) cv2.imshow("Main Image", detected_image) count = 0 for img in extracted_objects: count += 1 cv2.imshow("Window" + str(count), img) cv2.waitKey() cv2.destroyAllWindows() """ SAMPLE RESULT hololens : 39.69653248786926 : [611, 74, 751, 154] hololens : 87.6643180847168 : [23, 46, 90, 79] hololens : 89.25175070762634 : [191, 66, 243, 95] hololens : 64.49641585350037 : [437, 81, 514, 133] hololens : 91.78624749183655 : [380, 113, 423, 138] """ ================================================ FILE: examples/custom_detection_from_file_extract_objects_array.py ================================================ from imageai.Detection.Custom import CustomObjectDetection import cv2 detector = CustomObjectDetection() detector.setModelTypeAsYOLOv3() detector.setModelPath("yolov3_hololens-yolo_mAP-0.82726_epoch-73.pt") # https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/yolov3_hololens-yolo_mAP-0.82726_epoch-73.pt detector.setJsonPath("hololens-yolo_yolov3_detection_config.json") # https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/hololens-yolo_yolov3_detection_config.json detector.loadModel() detected_image, detections, extracted_objects = detector.detectObjectsFromImage(input_image="holo2.jpg", extract_detected_objects=True, output_type="array") for eachObject in detections: print(eachObject["name"], " : ", eachObject["percentage_probability"], " : ", eachObject["box_points"]) cv2.imshow("Main Image", detected_image) count = 0 for img in extracted_objects: count += 1 cv2.imshow("Window" 
+ str(count), img) cv2.waitKey() cv2.destroyAllWindows() """ SAMPLE RESULT hololens : 39.69653248786926 : [611, 74, 751, 154] hololens : 87.6643180847168 : [23, 46, 90, 79] hololens : 89.25175070762634 : [191, 66, 243, 95] hololens : 64.49641585350037 : [437, 81, 514, 133] hololens : 91.78624749183655 : [380, 113, 423, 138] """ ================================================ FILE: examples/custom_detection_train.py ================================================ from imageai.Detection.Custom import DetectionModelTrainer trainer = DetectionModelTrainer() trainer.setModelTypeAsYOLOv3() trainer.setDataDirectory(data_directory="hololens") trainer.setTrainConfig(object_names_array=["hololens"], batch_size=4, num_experiments=200, train_from_pretrained_model="yolov3.pt") #download pre-trained model via https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/yolov3.pt # If you are training to detect more than 1 object, set names of objects above like object_names_array=["hololens", "google-glass", "oculus", "magic-leap"] trainer.trainModel() """ SAMPLE RESULT Generating anchor boxes for training images... 
thr=0.25: 1.0000 best possible recall, 6.93 anchors past thr n=9, img_size=416, metric_all=0.463/0.856-mean/best, past_thr=0.549-mean: ==================== Pretrained YOLOv3 model loaded to initialize weights ==================== Epoch 1/100 ---------- Train: 30it [00:14, 2.09it/s] box loss-> 0.09820, object loss-> 0.27985, class loss-> 0.00000 Validation: 15it [01:45, 7.05s/it] recall: 0.085714 precision: 0.000364 mAP@0.5: 0.000186, mAP@0.5-0.95: 0.000030 Epoch 2/100 ---------- Train: 30it [00:07, 4.25it/s] box loss-> 0.08691, object loss-> 0.07011, class loss-> 0.00000 Validation: 15it [01:37, 6.53s/it] recall: 0.214286 precision: 0.000854 mAP@0.5: 0.000516, mAP@0.5-0.95: 0.000111 """ ================================================ FILE: examples/custom_detection_video.py ================================================ from imageai.Detection.Custom import CustomVideoObjectDetection import os execution_path = os.getcwd() video_detector = CustomVideoObjectDetection() video_detector.setModelTypeAsYOLOv3() video_detector.setModelPath("yolov3_hololens-yolo_mAP-0.82726_epoch-73.pt") # https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/yolov3_hololens-yolo_mAP-0.82726_epoch-73.pt video_detector.setJsonPath("hololens-yolo_yolov3_detection_config.json") # https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/hololens-yolo_yolov3_detection_config.json video_detector.loadModel() video_detector.detectObjectsFromVideo(input_file_path="holo1.mp4", output_file_path=os.path.join(execution_path, "holo1-detected3"), frames_per_second=20, minimum_percentage_probability=40, log_progress=True) ================================================ FILE: examples/custom_model_prediction.py ================================================ from imageai.Classification.Custom import CustomImageClassification import os execution_path = os.getcwd() prediction = CustomImageClassification() prediction.setModelTypeAsResNet50() 
prediction.setModelPath(os.path.join(execution_path, "resnet50-idenprof-test_acc_0.78200_epoch-91.pt")) # Download the model via this link https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/resnet50-idenprof-test_acc_0.78200_epoch-91.pt prediction.setJsonPath(os.path.join(execution_path, "idenprof_model_classes.json")) # Download from here https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/idenprof_model_classes.json prediction.loadModel(num_objects=10) predictions, probabilities = prediction.classifyImage(os.path.join(execution_path, "9.jpg"), result_count=5) for eachPrediction, eachProbability in zip(predictions, probabilities): print(eachPrediction , " : " , eachProbability) ================================================ FILE: examples/custom_model_training.py ================================================ from imageai.Classification.Custom import ClassificationModelTrainer model_trainer = ClassificationModelTrainer() model_trainer.setModelTypeAsResNet50() model_trainer.setDataDirectory("idenprof") model_trainer.trainModel(num_experiments=200, batch_size=32) ================================================ FILE: examples/image_custom_object_detection.py ================================================ from imageai.Detection import ObjectDetection import os from time import time execution_path = os.getcwd() detector = ObjectDetection() detector.setModelTypeAsYOLOv3() detector.setModelPath( os.path.join(execution_path , "yolov3.pt")) # Download the model via this link https://github.com/OlafenwaMoses/ImageAI/releases/tag/1.0 detector.loadModel() our_time = time() custom = detector.CustomObjects(bicycle=True, backpack=True) detections = detector.detectCustomObjectsFromImage( custom_objects=custom, input_image=os.path.join(execution_path , "7.jpg"), output_image_path=os.path.join(execution_path , "7-detected.jpg"), minimum_percentage_probability=40) for eachObject in detections: print(eachObject["name"] , " : " , 
eachObject["percentage_probability"], " : ", eachObject["box_points"] ) print("--------------------------------") ================================================ FILE: examples/image_prediction.py ================================================ from imageai.Classification import ImageClassification import os execution_path = os.getcwd() prediction = ImageClassification() prediction.setModelTypeAsResNet50() prediction.setModelPath(os.path.join(execution_path, "resnet50-19c8e357.pth")) # Download the model via this link https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/resnet50-19c8e357.pth prediction.loadModel() predictions, probabilities = prediction.classifyImage(os.path.join(execution_path, "1.jpg"), result_count=10) for eachPrediction, eachProbability in zip(predictions, probabilities): print(eachPrediction , " : " , eachProbability) ================================================ FILE: examples/object_detection.py ================================================ from imageai.Detection import ObjectDetection import os execution_path = os.getcwd() detector = ObjectDetection() detector.setModelTypeAsRetinaNet() detector.setModelPath( os.path.join(execution_path , "retinanet_resnet50_fpn_coco-eeacb38b.pth")) # Download the model via this link https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/retinanet_resnet50_fpn_coco-eeacb38b.pth detector.loadModel() detections = detector.detectObjectsFromImage(input_image=os.path.join(execution_path , "2.jpg"), output_image_path=os.path.join(execution_path , "2_detected.jpg"), minimum_percentage_probability=40) for eachObject in detections: print(eachObject["name"] , " : ", eachObject["percentage_probability"], " : ", eachObject["box_points"] ) print("--------------------------------") ================================================ FILE: examples/video_analysis_per_frame.py ================================================ from imageai.Detection import VideoObjectDetection 
import os from matplotlib import pyplot as plt execution_path = os.getcwd() color_index = {'bus': 'red', 'handbag': 'steelblue', 'giraffe': 'orange', 'spoon': 'gray', 'cup': 'yellow', 'chair': 'green', 'elephant': 'pink', 'truck': 'indigo', 'motorcycle': 'azure', 'refrigerator': 'gold', 'keyboard': 'violet', 'cow': 'magenta', 'mouse': 'crimson', 'sports ball': 'raspberry', 'horse': 'maroon', 'cat': 'orchid', 'boat': 'slateblue', 'hot dog': 'navy', 'apple': 'cobalt', 'parking meter': 'aliceblue', 'sandwich': 'skyblue', 'skis': 'deepskyblue', 'microwave': 'peacock', 'knife': 'cadetblue', 'baseball bat': 'cyan', 'oven': 'lightcyan', 'carrot': 'coldgrey', 'scissors': 'seagreen', 'sheep': 'deepgreen', 'toothbrush': 'cobaltgreen', 'fire hydrant': 'limegreen', 'remote': 'forestgreen', 'bicycle': 'olivedrab', 'toilet': 'ivory', 'tv': 'khaki', 'skateboard': 'palegoldenrod', 'train': 'cornsilk', 'zebra': 'wheat', 'tie': 'burlywood', 'orange': 'melon', 'bird': 'bisque', 'dining table': 'chocolate', 'hair drier': 'sandybrown', 'cell phone': 'sienna', 'sink': 'coral', 'bench': 'salmon', 'bottle': 'brown', 'car': 'silver', 'bowl': 'maroon', 'tennis racket': 'palevilotered', 'airplane': 'lavenderblush', 'pizza': 'hotpink', 'umbrella': 'deeppink', 'bear': 'plum', 'fork': 'purple', 'laptop': 'indigo', 'vase': 'mediumpurple', 'baseball glove': 'slateblue', 'traffic light': 'mediumblue', 'bed': 'navy', 'broccoli': 'royalblue', 'backpack': 'slategray', 'snowboard': 'skyblue', 'kite': 'cadetblue', 'teddy bear': 'peacock', 'clock': 'lightcyan', 'wine glass': 'teal', 'frisbee': 'aquamarine', 'donut': 'mincream', 'suitcase': 'seagreen', 'dog': 'springgreen', 'banana': 'emeraldgreen', 'person': 'honeydew', 'surfboard': 'palegreen', 'cake': 'sapgreen', 'book': 'lawngreen', 'potted plant': 'greenyellow', 'toaster': 'ivory', 'stop sign': 'beige', 'couch': 'khaki'} resized = False def forFrame(frame_number, output_array, output_count, returned_frame): plt.clf() this_colors = [] labels = [] 
sizes = [] counter = 0 for eachItem in output_count: counter += 1 labels.append(eachItem + " = " + str(output_count[eachItem])) sizes.append(output_count[eachItem]) this_colors.append(color_index[eachItem]) global resized if (resized == False): manager = plt.get_current_fig_manager() manager.resize(width=1000, height=500) resized = True plt.subplot(1, 2, 1) plt.title("Frame : " + str(frame_number)) plt.axis("off") plt.imshow(returned_frame, interpolation="none") plt.subplot(1, 2, 2) plt.title("Analysis: " + str(frame_number)) plt.pie(sizes, labels=labels, colors=this_colors, shadow=True, startangle=140, autopct="%1.1f%%") plt.pause(0.01) video_detector = VideoObjectDetection() video_detector.setModelTypeAsYOLOv3() video_detector.setModelPath(os.path.join(execution_path, "yolov3.pt")) # https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/yolov3.pt video_detector.loadModel() plt.show() video_detector.detectObjectsFromVideo(input_file_path=os.path.join(execution_path, "traffic.mp4"), output_file_path=os.path.join(execution_path, "video_frame_analysis") , frames_per_second=20, per_frame_function=forFrame, minimum_percentage_probability=30, return_detected_frame=True) ================================================ FILE: examples/video_analysis_per_second.py ================================================ from imageai.Detection import VideoObjectDetection import os from matplotlib import pyplot as plt execution_path = os.getcwd() color_index = {'bus': 'red', 'handbag': 'steelblue', 'giraffe': 'orange', 'spoon': 'gray', 'cup': 'yellow', 'chair': 'green', 'elephant': 'pink', 'truck': 'indigo', 'motorcycle': 'azure', 'refrigerator': 'gold', 'keyboard': 'violet', 'cow': 'magenta', 'mouse': 'crimson', 'sports ball': 'raspberry', 'horse': 'maroon', 'cat': 'orchid', 'boat': 'slateblue', 'hot dog': 'navy', 'apple': 'cobalt', 'parking meter': 'aliceblue', 'sandwich': 'skyblue', 'skis': 'deepskyblue', 'microwave': 'peacock', 'knife': 'cadetblue', 
'baseball bat': 'cyan', 'oven': 'lightcyan', 'carrot': 'coldgrey', 'scissors': 'seagreen', 'sheep': 'deepgreen', 'toothbrush': 'cobaltgreen', 'fire hydrant': 'limegreen', 'remote': 'forestgreen', 'bicycle': 'olivedrab', 'toilet': 'ivory', 'tv': 'khaki', 'skateboard': 'palegoldenrod', 'train': 'cornsilk', 'zebra': 'wheat', 'tie': 'burlywood', 'orange': 'melon', 'bird': 'bisque', 'dining table': 'chocolate', 'hair drier': 'sandybrown', 'cell phone': 'sienna', 'sink': 'coral', 'bench': 'salmon', 'bottle': 'brown', 'car': 'silver', 'bowl': 'maroon', 'tennis racket': 'palevilotered', 'airplane': 'lavenderblush', 'pizza': 'hotpink', 'umbrella': 'deeppink', 'bear': 'plum', 'fork': 'purple', 'laptop': 'indigo', 'vase': 'mediumpurple', 'baseball glove': 'slateblue', 'traffic light': 'mediumblue', 'bed': 'navy', 'broccoli': 'royalblue', 'backpack': 'slategray', 'snowboard': 'skyblue', 'kite': 'cadetblue', 'teddy bear': 'peacock', 'clock': 'lightcyan', 'wine glass': 'teal', 'frisbee': 'aquamarine', 'donut': 'mincream', 'suitcase': 'seagreen', 'dog': 'springgreen', 'banana': 'emeraldgreen', 'person': 'honeydew', 'surfboard': 'palegreen', 'cake': 'sapgreen', 'book': 'lawngreen', 'potted plant': 'greenyellow', 'toaster': 'ivory', 'stop sign': 'beige', 'couch': 'khaki'} resized = False def forSecond(frame_number, output_arrays, count_arrays, average_count, returned_frame): plt.clf() this_colors = [] labels = [] sizes = [] counter = 0 for eachItem in average_count: counter += 1 labels.append(eachItem + " = " + str(average_count[eachItem])) sizes.append(average_count[eachItem]) this_colors.append(color_index[eachItem]) global resized if (resized == False): manager = plt.get_current_fig_manager() manager.resize(width=1000, height=500) resized = True plt.subplot(1, 2, 1) plt.title("Second : " + str(frame_number)) plt.axis("off") plt.imshow(returned_frame, interpolation="none") plt.subplot(1, 2, 2) plt.title("Analysis: " + str(frame_number)) plt.pie(sizes, labels=labels, 
colors=this_colors, shadow=True, startangle=140, autopct="%1.1f%%") plt.pause(0.01) video_detector = VideoObjectDetection() video_detector.setModelTypeAsYOLOv3() video_detector.setModelPath(os.path.join(execution_path, "yolov3.pt")) # https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/yolov3.pt video_detector.loadModel() plt.show() video_detector.detectObjectsFromVideo(input_file_path=os.path.join(execution_path, "traffic.mp4"), output_file_path=os.path.join(execution_path, "video_second_analysis") , frames_per_second=20, per_second_function=forSecond, minimum_percentage_probability=30, return_detected_frame=True, log_progress=True) ================================================ FILE: examples/video_custom_object_detection.py ================================================ from imageai.Detection import VideoObjectDetection import os execution_path = os.getcwd() detector = VideoObjectDetection() detector.setModelTypeAsYOLOv3() detector.setModelPath(os.path.join(execution_path, "yolov3.pt")) # https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/yolov3.pt detector.loadModel() custom = detector.CustomObjects(person=True, motorcycle=True, bus=True) video_path = detector.detectCustomObjectsFromVideo(custom_objects=custom, input_file_path=os.path.join(execution_path, "traffic.mp4"), output_file_path=os.path.join(execution_path, "traffic_detected_custom") , frames_per_second=20, log_progress=True) print(video_path) ================================================ FILE: examples/video_object_detection.py ================================================ from imageai.Detection import VideoObjectDetection import os execution_path = os.getcwd() detector = VideoObjectDetection() detector.setModelTypeAsYOLOv3() detector.setModelPath(os.path.join(execution_path, "yolov3.pt")) # https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/yolov3.pt detector.loadModel() video_path = 
detector.detectObjectsFromVideo(input_file_path=os.path.join(execution_path, "traffic.mp4"), output_file_path=os.path.join(execution_path, "traffic_detected") , frames_per_second=20, log_progress=True) print(video_path) ================================================ FILE: imageai/Classification/CUSTOMCLASSIFICATION.md ================================================ # ImageAI : Custom Image Classification ImageAI provides 4 different algorithms and model types to perform custom image prediction using your custom models. You will be able to use your model trained with **ImageAI** and the corresponding model_class JSON file to predict custom objects that you have trained the model on. ### TABLE OF CONTENTS - :white_square_button: Custom Model Prediction - :white_square_button: Custom Model Prediction with Full Model (NEW) ### Custom Model Prediction
In this example, we will be using a model trained for 20 experiments on **IdenProf**, a dataset of uniformed professionals, which achieved 65.17% accuracy on the test dataset. (You can use your own trained model and generated JSON file. This class is provided mainly so that you can use your own custom models.) Download the ResNet50 model and the JSON file from the links below: - [**ResNet50**](https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/resnet50-idenprof-test_acc_0.78200_epoch-91.pt) _(Size = 90.4 mb)_ - [**idenprof_model_classes.json file**](https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/idenprof_model_classes.json) Great! Once you have downloaded this model file and the JSON file, start a new python project, and then copy the model file and the JSON file to your project folder where your python files (.py files) will be. Download the image below, or take any image on your computer that includes any of the following professionals (Chef, Doctor, Engineer, Farmer, Fireman, Judge, Mechanic, Pilot, Police and Waiter) and copy it to your python project's folder. Then create a python file and give it a name; an example is **FirstCustomPrediction.py**. 
Then write the code below into the python file: ### FirstCustomPrediction.py ```python from imageai.Classification.Custom import CustomImageClassification import os execution_path = os.getcwd() prediction = CustomImageClassification() prediction.setModelTypeAsResNet50() prediction.setModelPath(os.path.join(execution_path, "resnet50-idenprof-test_acc_0.78200_epoch-91.pt")) prediction.setJsonPath(os.path.join(execution_path, "idenprof_model_classes.json")) prediction.loadModel() predictions, probabilities = prediction.classifyImage(os.path.join(execution_path, "4.jpg"), result_count=5) for eachPrediction, eachProbability in zip(predictions, probabilities): print(f"{eachPrediction} : {eachProbability}") ``` **Sample Result:** ![Sample Result](../../data-images/4.jpg) ``` mechanic : 76.82620286941528 chef : 10.106072574853897 waiter : 4.036874696612358 police : 2.6663416996598244 pilot : 2.239348366856575 ``` The code above works as follows: ```python from imageai.Classification.Custom import CustomImageClassification import os ``` The code above imports the **ImageAI** class for custom image prediction and the python **os** module. ```python execution_path = os.getcwd() ``` The above line obtains the path to the current working directory, i.e. the folder you run your python file from (in this example, your FirstCustomPrediction.py). 
```python prediction = CustomImageClassification() prediction.setModelTypeAsResNet50() prediction.setModelPath(os.path.join(execution_path, "resnet50-idenprof-test_acc_0.78200_epoch-91.pt")) prediction.setJsonPath(os.path.join(execution_path, "idenprof_model_classes.json")) prediction.loadModel() ``` In the lines above, we created an instance of the `CustomImageClassification()` class in the first line, then we set the model type of the prediction object to ResNet50 by calling `.setModelTypeAsResNet50()` in the second line, we set the model path of the prediction object to the path of the custom model file (`resnet50-idenprof-test_acc_0.78200_epoch-91.pt`) we copied to the python file folder in the third line, we set the path to the `idenprof_model_classes.json` file of the model in the fourth line, and finally we load the model; the number of classes that can be predicted is read from the JSON file. ```python predictions, probabilities = prediction.classifyImage(os.path.join(execution_path, "4.jpg"), result_count=5) ``` In the above line, we defined 2 variables to be equal to the function called to predict an image, which is the `.classifyImage()` function, into which we passed the path to our image and also stated the number of prediction results we want to have (values from 1 to 10 in this case) by passing `result_count=5`. The `.classifyImage()` function will return 2 array objects with the first (**predictions**) being an array of predictions and the second (**probabilities**) being an array of the corresponding percentage probability for each prediction. ```python for eachPrediction, eachProbability in zip(predictions, probabilities): print(f"{eachPrediction} : {eachProbability}") ``` The above line obtains each object in the **predictions** array, and also obtains the corresponding percentage probability from the **probabilities** array, and finally prints the result of both to console. 
The **CustomImageClassification** class also supports the multiple predictions, input types and prediction speeds that are contained in the **ImageClassification** class. Follow this [link](README.md) to see all the details. ### Documentation We have provided full documentation for all **ImageAI** classes and functions. Find links below: * Documentation - **English Version [https://imageai.readthedocs.io](https://imageai.readthedocs.io)** ================================================ FILE: imageai/Classification/CUSTOMTRAINING.md ================================================ # ImageAI : Custom Prediction Model Training ## --------------------------------------------------- ## Introducing Jarvis and TheiaEngine. We the creators of ImageAI are glad to announce 2 new AI projects to provide state-of-the-art Generative AI, LLM and Image Understanding on your personal computer and servers. [![](../../jarvis.png)](https://jarvis.genxr.co) Install Jarvis on PC/Mac to setup limitless access to LLM powered AI Chats for your every day work, research and generative AI needs with 100% privacy and full offline capability. Visit [https://jarvis.genxr.co](https://jarvis.genxr.co/) to get started. [![](../../theiaengine.png)](https://www.genxr.co/theia-engine) [TheiaEngine](https://www.genxr.co/theia-engine), the next-generation computer Vision AI API capable of all Generative and Understanding computer vision tasks in a single API call and available via REST API to all programming languages. 
Features include - **Detect 300+ objects** ( 220 more objects than ImageAI) - **Provide answers to any content or context questions** asked on an image - very useful to get information on any object, action or information without needing to train a new custom model for every task - **Generate scene description and summary** - **Convert 2D image to 3D pointcloud and triangular mesh** - **Semantic Scene mapping of objects, walls, floors, etc** - **Stateless Face recognition and emotion detection** - **Image generation and augmentation from prompt** - etc. Visit [https://www.genxr.co/theia-engine](https://www.genxr.co/theia-engine) to try the demo and join in the beta testing today. ## --------------------------------------------------- **ImageAI** provides the most simple and powerful approach to training custom image prediction models using state-of-the-art MobileNetV2, ResNet50, InceptionV3 and DenseNet121 which you can load into the `imageai.Classification.Custom.CustomImageClassification` class. This allows you to train your own model on any set of images that corresponds to any type of objects/persons. The training process generates a JSON file that maps the object types in your image dataset and saves the model with the highest test accuracy. You will then perform custom image prediction using that model and the JSON file generated. ### TABLE OF CONTENTS - :white_square_button: Custom Model Training - :white_square_button: Saving Full Custom Model - :white_square_button: Training on the IdenProf Dataset - :white_square_button: Continuous Model Training - :white_square_button: Transfer Learning (Training from a pre-trained model) ### Custom Model Training
Because model training is a compute intensive task, we strongly advise you perform this experiment using a computer with an NVIDIA GPU and a CUDA-enabled build of PyTorch installed. Performing model training on CPU may take hours or days. With an NVIDIA GPU powered computer system, this will take a few hours. You can use Google Colab for this experiment as it has an NVIDIA K80 GPU available. To train a custom prediction model, you need to prepare the images you want to use to train the model. You will prepare the images as follows: 1. Create a dataset folder with the name you will like your dataset to be called (e.g pets) 2. In the dataset folder, create a folder by the name **train** 3. In the dataset folder, create a folder by the name **test** 4. In the train folder, create a folder for each object you want the model to predict and give the folder a name that corresponds to the respective object name (e.g dog, cat, squirrel, snake) 5. In the test folder, create a folder for each object you want the model to predict and give the folder a name that corresponds to the respective object name (e.g dog, cat, squirrel, snake) 6. In each folder present in the train folder, put the images of each object in its respective folder. These images are the ones to be used to train the model. To produce a model that can perform well in practical applications, I recommend you use about 500 or more images per object. 1000 images per object is just great 7. In each folder present in the test folder, put about 100 to 200 images of each object in its respective folder. These images are the ones to be used to test the model as it trains 8. 
Once you have done this, the structure of your image dataset folder should look like below: ``` pets//train//dog//dog-train-images pets//train//cat//cat-train-images pets//train//squirrel//squirrel-train-images pets//train//snake//snake-train-images pets//test//dog//dog-test-images pets//test//cat//cat-test-images pets//test//squirrel//squirrel-test-images pets//test//snake//snake-test-images ``` 9. Then your training code goes as follows: ```python from imageai.Classification.Custom import ClassificationModelTrainer model_trainer = ClassificationModelTrainer() model_trainer.setModelTypeAsResNet50() model_trainer.setDataDirectory("pets") model_trainer.trainModel(num_experiments=100, batch_size=32) ``` Yes! Just 5 lines of code and you can train any of the available 4 state-of-the-art Deep Learning algorithms on your custom dataset. Now let's take a look at how the code above works. ```python from imageai.Classification.Custom import ClassificationModelTrainer model_trainer = ClassificationModelTrainer() model_trainer.setModelTypeAsResNet50() model_trainer.setDataDirectory("pets") ``` In the first line, we import the **ImageAI** model training class, then we define the model trainer in the second line, we set the network type in the third line and set the path to the image dataset we want to train the network on in the fourth line. ```python model_trainer.trainModel(num_experiments=100, batch_size=32) ``` In the code above, we start the training process. The parameters stated in the function are as below: - **num_experiments** : this is to state the number of times the network will train over all the training images, which is also called epochs - **batch_size** : This is to state the number of images the network will process at once. The images are processed in batches until they are exhausted per each experiment performed. 
When you start the training, you should see something like this in the console: ``` ================================================== Training with GPU ================================================== Epoch 1/100 ---------- 100%|█████████████████████████████████████████████████████████████████████████████████| 282/282 [02:15<00:00, 2.08it/s] train Loss: 3.8062 Accuracy: 0.1178 100%|███████████████████████████████████████████████████████████████████████████████████| 63/63 [00:26<00:00, 2.36it/s] test Loss: 2.2829 Accuracy: 0.1215 Epoch 2/100 ---------- 100%|█████████████████████████████████████████████████████████████████████████████████| 282/282 [01:57<00:00, 2.40it/s] train Loss: 2.2682 Accuracy: 0.1303 100%|███████████████████████████████████████████████████████████████████████████████████| 63/63 [00:20<00:00, 3.07it/s] test Loss: 2.2388 Accuracy: 0.1470 ``` Let us explain the details shown above: 1. The line **Epoch 1/100** means the network is training the first experiment of the targeted 100 2. The progress bar (for example `282/282 [02:15<00:00, 2.08it/s]`) shows the number of batches that have been processed in the present experiment, and the `train Loss: ... Accuracy: ...` and `test Loss: ... Accuracy: ...` lines report the loss and accuracy on the training and test images for that experiment 3. The best model is automatically saved to `<dataset_directory>/models`, or to the `model_directory` you pass to `trainModel()` Once you are done training your custom model, you can use the "CustomImageClassification" class to perform image prediction with your model. Simply follow the link below. [imageai/Classification/CUSTOMCLASSIFICATION.md](https://github.com/OlafenwaMoses/ImageAI/blob/master/imageai/Classification/CUSTOMCLASSIFICATION.md) ### Documentation We have provided full documentation for all **ImageAI** classes and functions. 
class ClassificationModelTrainer():
    """
    This is the Classification Model training class, that allows you to define a deep learning network
    from the 4 available networks types supported by ImageAI which are MobileNetv2, ResNet50,
    InceptionV3 and DenseNet121 and then train on custom image data.

    Expected call order:
        1. one of the 'setModelTypeAs...()' functions
        2. setDataDirectory()
        3. (optional) freezeAllLayers() or fineTuneAllLayers()
        4. trainModel()
    """

    def __init__(self) -> None:
        self.__model_type = ""
        self.__device = "cuda" if torch.cuda.is_available() else "cpu"
        self.__data_dir = ""
        self.__data_loaders = None
        self.__class_names = None
        self.__dataset_sizes = None
        self.__dataset_name = ""
        self.__model = None
        self.__optimizer = None
        self.__lr_scheduler = None
        self.__loss_fn = nn.CrossEntropyLoss()
        self.__transfer_learning_mode = "fine_tune_all"
        self.__model_path = ""
        self.__training_params = None

    @staticmethod
    def __rename_legacy_densenet_keys(state_dict) -> None:
        """
        Rename, in place, DenseNet checkpoint keys written with the old
        'norm.1' / 'conv.2' layer names to the dot-free 'norm1' / 'conv2' form.
        """
        # '.'s are no longer allowed in module names, but previous densenet layers
        # as provided by the pytorch organization has names that uses '.'s.
        pattern = re.compile(
            r"^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\."
            r"(?:weight|bias|running_mean|running_var))$"
        )
        for key in list(state_dict.keys()):
            matched = pattern.match(key)
            if matched:
                state_dict[matched.group(1) + matched.group(2)] = state_dict.pop(key)

    def __set_training_param(self) -> None:
        """
        Build the model, optimizer and (optional) learning rate scheduler from
        the parameter set selected by the 'setModelTypeAs...()' call.

        Requires the dataset to be loaded already, because the size of the
        final layer is taken from the number of class names.

        :raises RuntimeError: if no model type has been set.
        """
        if not self.__model_type:
            raise RuntimeError("The model type is not set!!!")
        params = self.__training_params
        self.__model = params["model"]
        optimizer = params["optimizer"]
        lr_decay_rate = params["lr_decay_rate"]
        lr_step_size = params["lr_step_size"]

        if self.__model_path:
            self.__set_transfer_learning_mode()
            print("==> Transfer learning enabled")

        # change the last linear layer to have output features of
        # same size as the number of unique classes in the new
        # dataset. A freshly created layer is always trainable, even when
        # every pre-trained layer has been frozen.
        num_classes = len(self.__class_names)
        if self.__model_type == "mobilenet_v2":
            in_features = self.__model.classifier[1].in_features
            self.__model.classifier[1] = nn.Linear(in_features, num_classes)
        elif self.__model_type == "densenet121":
            in_features = self.__model.classifier.in_features
            self.__model.classifier = nn.Linear(in_features, num_classes)
        else:
            in_features = self.__model.fc.in_features
            self.__model.fc = nn.Linear(in_features, num_classes)

        self.__model.to(self.__device)
        self.__optimizer = optimizer(
                    self.__model.parameters(),
                    lr=params["lr"],
                    momentum=0.9,
                    weight_decay=params["weight_decay"]
                )
        if lr_decay_rate and lr_step_size:
            self.__lr_scheduler = lr_scheduler.StepLR(
                        self.__optimizer,
                        gamma=lr_decay_rate,
                        step_size=lr_step_size
                    )
        else:
            # Drop any scheduler left over from a previously selected model type.
            self.__lr_scheduler = None

    def __set_transfer_learning_mode(self) -> None:
        """
        Load the pre-trained weights from the transfer model path into the
        model and, in 'freeze_all' mode, stop gradients for every loaded layer.
        """
        # map_location lets a checkpoint saved on a GPU be loaded on a
        # CPU-only machine (and the other way round).
        state_dict = torch.load(self.__model_path, map_location=self.__device)
        if self.__model_type == "densenet121":
            self.__rename_legacy_densenet_keys(state_dict)

        self.__model.load_state_dict(state_dict)
        self.__model.to(self.__device)

        if self.__transfer_learning_mode == "freeze_all":
            for param in self.__model.parameters():
                param.requires_grad = False

    def __load_data(self, batch_size : int = 8) -> None:
        """
        Create the 'train' and 'test' datasets and data loaders from the data
        directory and record the class names, dataset sizes and dataset name.

        :param batch_size: number of images per batch.
        :raises RuntimeError: if the data directory has not been set.
        """
        if not self.__data_dir:
            raise RuntimeError("The dataset directory not yet set.")

        # InceptionV3 uses its own set of transforms (299px crops).
        if self.__model_type == "inception_v3":
            transforms_by_split = data_transforms2
        else:
            transforms_by_split = data_transforms1
        splits = ["train", "test"]

        image_dataset = {
                    x: datasets.ImageFolder(
                            os.path.join(self.__data_dir, x),
                            transforms_by_split[x]
                        )
                    for x in splits
                }
        self.__data_loaders = {
                    x: torch.utils.data.DataLoader(
                            image_dataset[x], batch_size=batch_size,
                            shuffle=True
                        )
                    for x in splits
                }
        self.__dataset_sizes = {x: len(image_dataset[x]) for x in splits}
        self.__class_names = image_dataset["train"].classes
        # normpath strips trailing separators of either flavour ('/' or '\\').
        self.__dataset_name = os.path.basename(os.path.normpath(self.__data_dir))

    def setDataDirectory(self, data_directory : str = "") -> None:
        """
        Sets the directory that contains the training and test dataset.

        The data directory should contain 'train' and 'test' subdirectories
        for the training and test datasets.

        In each of these subdirectories, each object must have a dedicated folder
        and the folder containing images for the object.

        The structure of the 'test' and 'train' folder must be as follows:

        >> train >> class1 >> class1_train_images
                 >> class2 >> class2_train_images
                 >> class3 >> class3_train_images
                 >> class4 >> class4_train_images
                 >> class5 >> class5_train_images
        >> test >> class1 >> class1_test_images
                >> class2 >> class2_test_images
                >> class3 >> class3_test_images
                >> class4 >> class4_test_images
                >> class5 >> class5_test_images

        :param data_directory: path to the dataset directory.
        :raises ValueError: if the path is not an existing directory.
        """
        if not os.path.isdir(data_directory):
            raise ValueError("expected a path to a directory")
        self.__data_dir = data_directory

    def setModelTypeAsMobileNetV2(self) -> None:
        """
        'setModelTypeAsMobileNetV2()' is used to set the model type to the MobileNetV2 model.
        :return:
        """
        self.__model_type = "mobilenet_v2"
        self.__training_params = mobilenet_v2_train_params()

    def setModelTypeAsResNet50(self) -> None:
        """
        'setModelTypeAsResNet50()' is used to set the model type to the ResNet50 model.
        :return:
        """
        self.__model_type = "resnet50"
        self.__training_params = resnet50_train_params()

    def setModelTypeAsInceptionV3(self) -> None:
        """
        'setModelTypeAsInceptionV3()' is used to set the model type to the InceptionV3 model.
        :return:
        """
        self.__model_type = "inception_v3"
        self.__training_params = inception_v3_train_params()

    def setModelTypeAsDenseNet121(self) -> None:
        """
        'setModelTypeAsDenseNet121()' is used to set the model type to the DenseNet121 model.
        :return:
        """
        self.__model_type = "densenet121"
        self.__training_params = densenet121_train_params()

    def freezeAllLayers(self) -> None:
        """
        Set the transfer learning mode to freeze all layers.

        NOTE: The last layer (fully connected layer) is trainable.
        """
        self.__transfer_learning_mode = "freeze_all"

    def fineTuneAllLayers(self) -> None:
        """
        Sets the transfer learning mode to fine-tune the pretrained weights
        """
        self.__transfer_learning_mode = "fine_tune_all"

    def trainModel(
                self,
                num_experiments : int = 100,
                batch_size : int = 8,
                model_directory : str = None,
                transfer_from_model: str = None,
                verbose : bool = True
            ) -> None:
        """
        'trainModel()' function starts the model actual training. It accepts the following values:
        - num_experiments: Also known as epochs, is the number of times the network will process all the images in the training dataset
        - batch_size: The number of image data that will be loaded into memory at once during training
        - model_directory: Location where json mapping and trained models will be saved. Defaults to a
                           'models' folder inside the data directory; missing folders are created.
        - transfer_from_model: Path to a pre-trained imagenet model that corresponds to the training model type
        - verbose: Option to enable/disable training logs

        Each time the test accuracy improves, the model weights are saved as
        '<model_type>-<dataset_name>-test_acc_<accuracy>_epoch-<epoch>.pt' (the epoch in the
        file name is zero-based) and the previously saved file is removed.

        :param num_experiments:
        :param batch_size:
        :model_directory:
        :transfer_from_model:
        :verbose:
        :raises RuntimeError: if the model type or the data directory has not been set.
        :return:
        """
        # Fail fast, before any dataset is read from disk.
        if not self.__model_type:
            raise RuntimeError("The model type is not set!!!")

        # Load dataset
        self.__load_data(batch_size)

        # Check and effect transfer learning if enabled
        if transfer_from_model:
            extension_check(transfer_from_model)
            self.__model_path = transfer_from_model

        # Load training parameters for the specified model type
        self.__set_training_param()

        # Create output directory to save trained models and json mappings
        if not model_directory:
            model_directory = os.path.join(self.__data_dir, "models")
        os.makedirs(model_directory, exist_ok=True)

        # Dump class mappings to json file
        mapping_path = os.path.join(model_directory, f"{self.__dataset_name}_model_classes.json")
        with open(mapping_path, "w") as f:
            classes_dict = {
                    str(index): name
                    for index, name in enumerate(sorted(self.__class_names))
                }
            json.dump(classes_dict, f)

        since = time.time()
        best_acc = 0.0
        prev_save_name = ""

        # Device check and log
        print("=" * 50)
        if self.__device == "cuda":
            print("Training with GPU")
        else:
            print("Training with CPU. This might cause slower train.")
        print("=" * 50)

        for epoch in range(num_experiments):
            if verbose:
                print(f"Epoch {epoch + 1}/{num_experiments}", "-"*10, sep="\n")

            # each epoch has a training and test phase
            for phase in ["train", "test"]:
                is_training = phase == "train"
                if is_training:
                    self.__model.train()
                else:
                    self.__model.eval()

                running_loss = 0.0
                running_corrects = 0

                # Iterate on the dataset in batches
                for imgs, labels in tqdm(self.__data_loaders[phase]):
                    imgs = imgs.to(self.__device)
                    labels = labels.to(self.__device)

                    self.__optimizer.zero_grad()

                    # Gradients are only tracked while training.
                    with torch.set_grad_enabled(is_training):
                        output = self.__model(imgs)
                        # InceptionV3 may return (logits, aux_logits); only the
                        # main logits are used.
                        if self.__model_type == "inception_v3" and isinstance(output, InceptionOutputs):
                            output = output[0]
                        _, preds = torch.max(output, 1)
                        loss = self.__loss_fn(output, labels)

                        if is_training:
                            loss.backward()
                            self.__optimizer.step()
                    running_loss += loss.item() * imgs.size(0)
                    running_corrects += int(torch.sum(preds == labels).item())

                # The learning rate schedule advances once per training epoch.
                if is_training and isinstance(self.__lr_scheduler, lr_scheduler.StepLR):
                    self.__lr_scheduler.step()

                # Compute accuracy and loss metrics post epoch training
                epoch_loss = running_loss / self.__dataset_sizes[phase]
                epoch_acc = running_corrects / self.__dataset_sizes[phase]

                if verbose:
                    print(f"{phase} Loss: {epoch_loss:.4f} Accuracy: {epoch_acc:.4f}")
                if phase == "test" and epoch_acc > best_acc:
                    best_acc = epoch_acc
                    recent_save_name = self.__model_type+f"-{self.__dataset_name}-test_acc_{best_acc:.5f}_epoch-{epoch}.pt"
                    # Write the new best checkpoint first, so a failed save
                    # never leaves the directory without a model.
                    torch.save(
                            self.__model.state_dict(),
                            os.path.join(model_directory, recent_save_name)
                        )
                    if prev_save_name:
                        try:
                            os.remove(os.path.join(model_directory, prev_save_name))
                        except FileNotFoundError:
                            # Already moved or deleted by the user; nothing to clean up.
                            pass
                    prev_save_name = recent_save_name

        time_elapsed = time.time() - since
        print(f"Training completed in {time_elapsed//60:.0f}m {time_elapsed % 60:.0f}s")
        print(f"Best test accuracy: {best_acc:.4f}")
class CustomImageClassification:
    """
    An implementation that allows for easy classification of images
    using the state of the art computer vision classification model
    trained on custom data.

    The class provides 4 different classification models which are ResNet50, DenseNet121, InceptionV3 and MobileNetV2.

    The following functions are required to be called before a classification can be made

    * At least one of the following and it must correspond to the model set in the setModelPath()
    [setModelTypeAsMobileNetV2(), setModelTypeAsResNet50(), setModelTypeAsDenseNet121(), setModelTypeAsInceptionV3()]
    * setModelPath: This is used to specify the absolute path to the trained model file.
    * setJsonPath: This is used to specify the absolute path to the
    json file saved during the training of the custom model.
    * useCPU (Optional): If you will like to force the image classification to be performed on CPU, call this function.
    * loadModel: Used to load the trained model weights and json data.
    * classifyImage(): Used for classifying an image.
    """
    def __init__(self) -> None:
        self.__model = None
        self.__model_type = ""
        # Initialised here so that loadModel() without a prior setModelPath()
        # fails on a missing file instead of a missing attribute.
        self.__model_path = ""
        self.__model_loaded = False
        self.__device = "cuda" if torch.cuda.is_available() else "cpu"
        self.__json_path = None
        self.__class_names = None

    def __load_image(self, image_input: Union[str, np.ndarray, Image.Image]) -> torch.Tensor:
        """
        Convert the input image into a normalised, single-image batch tensor.

        :param image_input: file path, numpy array or PIL image.
        :raises ValueError: if the path is not a file or the input type is unsupported.
        :return: tensor with one leading batch dimension.
        """
        # InceptionV3 is trained and tested on 299px crops (see data_transforms2);
        # the other networks use a 256px resize followed by a 224px centre crop.
        if self.__model_type == "inception_v3":
            resize_to, crop_to = 299, 299
        else:
            resize_to, crop_to = 256, 224
        preprocess = transforms.Compose([
                transforms.Resize(resize_to),
                transforms.CenterCrop(crop_to),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
            ])

        if isinstance(image_input, str):
            if not os.path.isfile(image_input):
                raise ValueError(f"image path '{image_input}' is not found or a valid file")
            # The context manager closes the file once the pixels are copied.
            with Image.open(image_input) as opened:
                img = opened.convert("RGB")
        elif isinstance(image_input, np.ndarray):
            img = Image.fromarray(image_input).convert("RGB")
        elif isinstance(image_input, Image.Image):
            img = image_input.convert("RGB")
        else:
            raise ValueError("Invalid image input format")

        return preprocess(img).unsqueeze(0)

    def __load_classes(self):
        """
        Read the class names from the json mapping file, in file order.

        :raises ValueError: if no json path has been set.
        """
        if not self.__json_path:
            raise ValueError("Invalid json path. Set a valid json mapping path by calling the 'setJsonPath()' function")
        with open(self.__json_path, 'r') as f:
            self.__class_names = list(json.load(f).values())

    def setModelPath(self, path : str) -> None:
        """
        Sets the path to the pretrained weight.

        :raises ValueError: if the path is not an existing file.
        """
        if not os.path.isfile(path):
            raise ValueError(
                f"The path '{path}' isn't a valid file. Ensure you specify the path to a valid trained model file."
            )
        extension_check(path)
        self.__model_path = path
        # A new weight file invalidates whatever was loaded before.
        self.__model_loaded = False

    def setJsonPath(self, path : str) -> None:
        """
        Sets the path to the json file that maps class indexes to class names,
        as saved during the training of the custom model.

        :raises ValueError: if the path is not an existing file.
        """
        if not os.path.isfile(path):
            raise ValueError(
                "parameter path should be a valid path to the json mapping file."
            )
        self.__json_path = path

    def setModelTypeAsMobileNetV2(self) -> None:
        """
        'setModelTypeAsMobileNetV2()' is used to set the model type to the MobileNetV2 model.
        :return:
        """
        self.__model_type = "mobilenet_v2"

    def setModelTypeAsResNet50(self) -> None:
        """
        'setModelTypeAsResNet50()' is used to set the model type to the ResNet50 model.
        :return:
        """
        self.__model_type = "resnet50"

    def setModelTypeAsInceptionV3(self) -> None:
        """
        'setModelTypeAsInceptionV3()' is used to set the model type to the InceptionV3 model.
        :return:
        """
        self.__model_type = "inception_v3"

    def setModelTypeAsDenseNet121(self) -> None:
        """
        'setModelTypeAsDenseNet121()' is used to set the model type to the DenseNet121 model.
        :return:
        """
        self.__model_type = "densenet121"

    def useCPU(self):
        """
        Used to force classification to be done on CPU.
        By default, classification will occur on GPU compute if available else CPU compute.
        """
        self.__device = "cpu"
        if self.__model_loaded:
            # Reload so the weights end up on the CPU.
            self.__model_loaded = False
            self.loadModel()

    def loadModel(self) -> None:
        """
        'loadModel()' function is used to load the model weights into the model architecture from the file path defined
        in the setModelPath() function. The class names are read from the file set with setJsonPath().

        Calling it again while a model is already loaded does nothing.

        :raises ValueError: if the json mapping path has not been set.
        :raises RuntimeError: if the model type has not been set.
        :raises Exception: if the weights cannot be loaded; the original error is attached as the cause.
        :return:
        """
        if self.__model_loaded:
            return

        self.__load_classes()

        if self.__model_type not in ("resnet50", "mobilenet_v2", "inception_v3", "densenet121"):
            raise RuntimeError("Unknown model type.\nEnsure the model type is properly set.")

        num_classes = len(self.__class_names)
        try:
            # change the last layer of the networks to conform to the number
            # of unique classes in the custom dataset used to train the custom
            # model. The constructors are called without the deprecated
            # 'pretrained' argument; the default already builds an untrained network.
            if self.__model_type == "resnet50":
                model = resnet50()
                model.fc = nn.Linear(model.fc.in_features, num_classes)
            elif self.__model_type == "mobilenet_v2":
                model = mobilenet_v2()
                model.classifier[1] = nn.Linear(model.classifier[1].in_features, num_classes)
            elif self.__model_type == "inception_v3":
                # Weight initialisation is skipped: every weight is overwritten below.
                model = inception_v3(init_weights=False)
                model.fc = nn.Linear(model.fc.in_features, num_classes)
            else:
                model = densenet121()
                model.classifier = nn.Linear(model.classifier.in_features, num_classes)

            state_dict = torch.load(self.__model_path, map_location=self.__device)
            if self.__model_type == "densenet121":
                # '.'s are no longer allowed in module names, but previous densenet layers
                # as provided by the pytorch organization has names that uses '.'s.
                pattern = re.compile(
                        r"^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\."
                        r"(?:weight|bias|running_mean|running_var))$"
                        )
                for key in list(state_dict.keys()):
                    matched = pattern.match(key)
                    if matched:
                        state_dict[matched.group(1) + matched.group(2)] = state_dict.pop(key)

            model.load_state_dict(state_dict)
            model.to(self.__device).eval()
        except Exception as e:
            # Same exception type as before, but the real cause is kept in the chain.
            raise Exception("Weight loading failed.\nEnsure the model path is"
                " set and the weight file is in the specified model path.") from e

        self.__model = model
        self.__model_loaded = True

    def classifyImage(self, image_input: Union[str, np.ndarray, Image.Image], result_count: int = 5) -> Tuple[List[str], List[float]]:
        """
        'classifyImage()' function is used to classify a given image by receiving the following arguments:
            * image_input: file path, numpy array or PIL image of the input image.
            * result_count (optional) , the number of classifications to be sent which must be whole numbers between 1 and total number of classes the model is trained to classify. Defaults to 5; values above the number of classes return every class.

        This function returns 2 arrays namely 'classification_results' and 'classification_probabilities'. The 'classification_results'
        contains possible objects classes arranged in descending of their percentage probabilities. The 'classification_probabilities'
        contains the percentage probability of each object class. The position of each object class in the 'classification_results'
        array corresponds with the positions of the percentage probability in the 'classification_probabilities' array.

        :param image_input:
        :param result_count:
        :raises RuntimeError: if loadModel() has not been called.
        :return classification_results, classification_probabilities:
        """
        if not self.__model_loaded:
            raise RuntimeError(
                "Model not yet loaded. You need to call '.loadModel()' before performing image classification"
            )

        images = self.__load_image(image_input).to(self.__device)
        # topk cannot return more entries than there are classes.
        top_k = min(result_count, len(self.__class_names))

        with torch.no_grad():
            output = self.__model(images)
        probabilities = torch.softmax(output, dim=1)
        topN_prob, topN_catid = torch.topk(probabilities, top_k)

        labels_pred = []
        probabilities_pred = []
        for class_ids, scores in zip(topN_catid.tolist(), topN_prob.tolist()):
            for class_id, score in zip(class_ids, scores):
                labels_pred.append(self.__class_names[class_id])
                # Probabilities are reported as percentages with 4 decimals.
                probabilities_pred.append(round(score * 100, 4))

        return labels_pred, probabilities_pred
resnet50_train_params(): model = resnet50(pretrained=False) return { "model": model, "optimizer": SGD, "weight_decay":1e-4, "lr":0.1, "lr_decay_rate": None, "lr_step_size": None } def inception_v3_train_params(): model = inception_v3(pretrained=False, init_weights=False) return { "model": model, "optimizer": SGD, "weight_decay":0, "lr":0.045, "lr_decay_rate": 0.94, "lr_step_size":2 } def mobilenet_v2_train_params(): model = mobilenet_v2(pretrained=False) return { "model": model, "optimizer": SGD, "weight_decay":4e-5, "lr":0.045, "lr_decay_rate": 0.98, "lr_step_size":1 } def densenet121_train_params(): model = densenet121(pretrained=False) return { "model": model, "optimizer": SGD, "weight_decay":1e-4, "lr":0.1, "lr_decay_rate": None, "lr_step_size":None, } ================================================ FILE: imageai/Classification/README.md ================================================ # ImageAI : Image Classification ## --------------------------------------------------- ## Introducing Jarvis and TheiaEngine. We the creators of ImageAI are glad to announce 2 new AI projects to provide state-of-the-art Generative AI, LLM and Image Understanding on your personal computer and servers. [![](../../jarvis.png)](https://jarvis.genxr.co) Install Jarvis on PC/Mac to setup limitless access to LLM powered AI Chats for your every day work, research and generative AI needs with 100% privacy and full offline capability. Visit [https://jarvis.genxr.co](https://jarvis.genxr.co/) to get started. [![](../../theiaengine.png)](https://www.genxr.co/theia-engine) [TheiaEngine](https://www.genxr.co/theia-engine), the next-generation computer Vision AI API capable of all Generative and Understanding computer vision tasks in a single API call and available via REST API to all programming languages. 
Features include - **Detect 300+ objects** (220 more objects than ImageAI) - **Provide answers to any content or context questions** asked on an image - very useful to get information on any object, action or information without needing to train a new custom model for every task - **Generate scene description and summary** - **Convert 2D image to 3D pointcloud and triangular mesh** - **Semantic Scene mapping of objects, walls, floors, etc** - **Stateless Face recognition and emotion detection** - **Image generation and augmentation from prompt** - etc. Visit [https://www.genxr.co/theia-engine](https://www.genxr.co/theia-engine) to try the demo and join in the beta testing today. ## --------------------------------------------------- ### TABLE OF CONTENTS - :white_square_button: First Prediction - :white_square_button: Documentation ImageAI provides 4 different algorithms and model types to perform image prediction. To perform image prediction on any picture, take the following simple steps. The 4 algorithms provided for image prediction include **MobileNetV2**, **ResNet50**, **InceptionV3** and **DenseNet121**. Each of these algorithms has its own model file, which you must use depending on the choice of your algorithm.
To download the model file for your choice of algorithm, click on any of the links below: - **[MobileNetV2](https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/mobilenet_v2-b0353104.pth)** _(Size = 4.82 mb, fastest prediction time and moderate accuracy)_ - **[ResNet50](https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/resnet50-19c8e357.pth)** by Microsoft Research _(Size = 98 mb, fast prediction time and high accuracy)_ - **[InceptionV3](https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/inception_v3_google-1a9a5a14.pth)** by Google Brain team _(Size = 91.6 mb, slow prediction time and higher accuracy)_ - **[DenseNet121](https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/densenet121-a639ec97.pth)** by Facebook AI Research _(Size = 31.6 mb, slower prediction time and highest accuracy)_ Great! Once you have downloaded this model file, start a new python project, and then copy the model file to your project folder where your python files (.py files) will be . Download the image below, or take any image on your computer and copy it to your python project's folder. Then create a python file and give it a name; an example is `FirstPrediction.py`. Then write the code below into the python file: ### FirstPrediction.py
```python from imageai.Classification import ImageClassification import os execution_path = os.getcwd() prediction = ImageClassification() prediction.setModelTypeAsResNet50() prediction.setModelPath(os.path.join(execution_path, "resnet50-19c8e357.pth")) prediction.loadModel() predictions, probabilities = prediction.classifyImage(os.path.join(execution_path, "1.jpg"), result_count=5 ) for eachPrediction, eachProbability in zip(predictions, probabilities): print(eachPrediction , " : " , eachProbability) ``` Sample Result: ![](../../data-images/1.jpg) ``` convertible : 52.459555864334106 sports_car : 37.61284649372101 pickup : 3.1751200556755066 car_wheel : 1.817505806684494 minivan : 1.7487050965428352 ``` The code above works as follows: ```python from imageai.Classification import ImageClassification import os ``` The code above imports the `ImageClassification` class from the `ImageAI` library and the Python `os` module. ```python execution_path = os.getcwd() ``` The above line obtains the current working directory, which is the folder that contains your python file (in this example, your FirstPrediction.py) when you run the script from that folder. ```python prediction = ImageClassification() prediction.setModelTypeAsResNet50() prediction.setModelPath(os.path.join(execution_path, "resnet50-19c8e357.pth")) ``` In the lines above, we created an instance of the `ImageClassification()` class in the first line, then we set the model type of the prediction object to ResNet50 by calling `.setModelTypeAsResNet50()` in the second line, and then we set the model path of the prediction object to the path of the model file (`resnet50-19c8e357.pth`) we copied to the python file folder in the third line.
```python predictions, probabilities = prediction.classifyImage(os.path.join(execution_path, "1.jpg"), result_count=5 ) ``` In the above line, we assigned the 2 values returned by the `.classifyImage()` function to 2 variables. We passed the function the path to our image and stated the number of prediction results we want to have (values from 1 to 1000) by passing `result_count=5`. The `.classifyImage()` function returns 2 array objects, with the first (**predictions**) being an array of predictions and the second (**probabilities**) being an array of the corresponding percentage probability for each prediction. ```python for eachPrediction, eachProbability in zip(predictions, probabilities): print(eachPrediction, " : " , eachProbability) ``` The above lines obtain each object in the **predictions** array together with the corresponding percentage probability from the **probabilities** array, and print both to the console. ### Documentation We have provided full documentation for all **ImageAI** classes and functions.
class ImageClassification:
    """
    This is the image classification class in the ImageAI library. It allows you to classify objects
    into all the 1000 different classes in the ImageNet dataset
    [ https://www.kaggle.com/c/imagenet-object-localization-challenge/overview/description ].

    The class provides 4 different classification models which are ResNet50, DenseNet121,
    InceptionV3 and MobileNetV2.

    The following functions are required to be called before a classification can be made

    * One of the following, and it must correspond to the model file set in setModelPath()
      [setModelTypeAsMobileNetV2(), setModelTypeAsResNet50(), setModelTypeAsDenseNet121(),
      setModelTypeAsInceptionV3()]. Only the first call has an effect.
    * setModelPath(): This is used to specify the absolute path to a pretrained model file.
      Download any of the files in this release ->
      https://github.com/OlafenwaMoses/ImageAI/releases/tag/3.0.0-pretrained
    * useCPU() (Optional): If you will like to force the image classification to be performed
      on CPU, call this function.
    * loadModel(): Used to load the pretrained model weights.
    * classifyImage(): Used for classifying an image.
    """

    def __init__(self) -> None:
        self.__model_type: str = None
        self.__model: torch.nn.Module = None
        self.__model_path: str = None
        # The class names file ships next to this module.
        self.__classes_path: str = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "imagenet_classes.txt"
        )
        self.__model_loaded: bool = False
        self.__device: str = "cuda" if torch.cuda.is_available() else "cpu"
        self.__classes: List[str] = []

    def setModelPath(self, path: str):
        """
        'setModelPath()' function is required and is used to set the file path to the model adopted
        from the list of the available 4 model types. The model path must correspond to the model
        type set for the classification instance object.

        :param path: path to an existing trained model file
        :raises ValueError: if 'path' is not an existing file
        :return:
        """
        if not os.path.isfile(path):
            raise ValueError(
                f"The path '{path}' isn't a valid file. Ensure you specify the path to a valid trained model file."
            )
        extension_check(path)
        self.__model_path = path

    def __load_classes(self) -> None:
        """Read the class names (one per line) into 'self.__classes'."""
        with open(self.__classes_path, encoding="utf-8") as f:
            self.__classes = [line.strip() for line in f]

    def __load_image(self, image_input: Union[str, np.ndarray, "Image.Image"]) -> torch.Tensor:
        """
        Convert a file path, numpy array or PIL image into a normalized batch tensor
        holding a single image.

        :param image_input: file path, numpy array or PIL image
        :raises ValueError: if the path is not a file or the input type is unsupported
        :return: the preprocessed image stacked into a batch of one
        """
        # NOTE(review): the same 224x224 preprocessing is applied to every architecture,
        # including InceptionV3 - confirm this is intended.
        preprocess = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

        if isinstance(image_input, str):
            if not os.path.isfile(image_input):
                raise ValueError(f"image path '{image_input}' is not found or a valid file")
            img = Image.open(image_input)
        elif isinstance(image_input, np.ndarray):
            img = Image.fromarray(image_input)
        elif isinstance(image_input, Image.Image):
            img = image_input
        else:
            raise ValueError("Invalid image input format")

        return torch.stack([preprocess(img.convert("RGB"))])

    def __set_model_type(self, model_type: str) -> None:
        """Record the model type; only the first call on an instance has an effect."""
        if self.__model_type is None:
            self.__model_type = model_type

    def setModelTypeAsResNet50(self):
        """
        'setModelTypeAsResNet50()' is used to set the model type to the ResNet50 model.
        It has no effect if a model type has already been set.
        :return:
        """
        self.__set_model_type("resnet50")

    def setModelTypeAsDenseNet121(self):
        """
        'setModelTypeAsDenseNet121()' is used to set the model type to the DenseNet121 model.
        It has no effect if a model type has already been set.
        :return:
        """
        self.__set_model_type("densenet121")

    def setModelTypeAsInceptionV3(self):
        """
        'setModelTypeAsInceptionV3()' is used to set the model type to the InceptionV3 model.
        It has no effect if a model type has already been set.
        :return:
        """
        self.__set_model_type("inceptionv3")

    def setModelTypeAsMobileNetV2(self):
        """
        'setModelTypeAsMobileNetV2()' is used to set the model type to the MobileNetV2 model.
        It has no effect if a model type has already been set.
        :return:
        """
        self.__set_model_type("mobilenetv2")

    def useCPU(self):
        """
        Used to force classification to be done on CPU.
        By default, classification will occur on GPU compute if available else CPU compute.
        """
        self.__device = "cpu"
        if self.__model_loaded:
            # The weights are already in memory; moving the module is enough,
            # there is no need to read the model file again.
            self.__model.to(self.__device)

    def loadModel(self):
        """
        'loadModel()' function is used to load the model weights into the model architecture from
        the file path defined in the setModelPath() function. Calling it again once the model is
        loaded does nothing.

        :raises ValueError: if no model path was set or the model type is missing/unsupported
        :raises RuntimeError: if the weights or the class names cannot be loaded
        :return:
        """
        if self.__model_loaded:
            return

        if self.__model_path is None:
            raise ValueError(
                "Model path not specified. Call '.setModelPath()' and parse the path to the model file before loading the model."
            )
        if self.__model_type not in classification_models:
            raise ValueError(
                f"Model type '{self.__model_type}' not supported."
            )

        try:
            # NOTE: 'classification_models' holds one module-level instance per architecture,
            # so classifiers of the same model type share that instance.
            model = classification_models[self.__model_type]["model"]

            # map_location lets weights saved on a GPU load on a CPU-only host
            # (or after useCPU() has been called).
            state_dict = torch.load(self.__model_path, map_location=self.__device)

            if self.__model_type == "densenet121":
                # '.'s are no longer allowed in module names, but previous densenet layers
                # as provided by the Pytorch's model zoo have names that use '.'s.
                pattern = re.compile(
                    r"^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\."
                    r"(?:weight|bias|running_mean|running_var))$"
                )
                for key in list(state_dict.keys()):
                    res = pattern.match(key)
                    if res:
                        state_dict[res.group(1) + res.group(2)] = state_dict.pop(key)

            model.load_state_dict(state_dict)
            model.to(self.__device)
            model.eval()
            self.__load_classes()
        except Exception as e:
            raise RuntimeError(
                "Weight loading failed.\nEnsure the model path is"
                " set and the weight file is in the specified model path."
            ) from e

        # Only publish the model once every step above has succeeded.
        self.__model = model
        self.__model_loaded = True

    def classifyImage(self, image_input: Union[str, np.ndarray, "Image.Image"], result_count: int=5) -> Tuple[List[str], List[float]]:
        """
        'classifyImage()' function is used to classify a given image by receiving the following arguments:
            * image_input: file path, numpy array or PIL image of the input image.
            * result_count (optional) , the number of classifications to be sent which must be whole numbers between
                1 and 1000. The default is 5.

        This function returns 2 arrays namely 'classification_results' and 'classification_probabilities'. The 'classification_results'
        contains possible objects classes arranged in descending of their percentage probabilities. The 'classification_probabilities'
        contains the percentage probability of each object class. The position of each object class in the 'classification_results'
        array corresponds with the positions of the percentage probability in the 'classification_probabilities' array.

        :param image_input:
        :param result_count:
        :raises RuntimeError: if loadModel() has not completed successfully
        :raises ValueError: if 'result_count' is outside 1..number of classes, or the image input is invalid
        :return classification_results, classification_probabilities:
        """
        if not self.__model_loaded:
            raise RuntimeError(
                "Model not yet loaded. You need to call '.loadModel()' before performing image classification"
            )
        if not 1 <= result_count <= len(self.__classes):
            raise ValueError(
                f"'result_count' must be between 1 and {len(self.__classes)}, got {result_count}."
            )

        images = self.__load_image(image_input).to(self.__device)

        with torch.no_grad():
            output = self.__model(images)
        probabilities = torch.softmax(output, dim=1)
        top_probs, top_ids = torch.topk(probabilities, result_count)

        labels_pred = []
        probabilities_pred = []
        # Flatten the (batch, result_count) results into two parallel lists.
        for row_ids, row_probs in zip(top_ids.tolist(), top_probs.tolist()):
            for class_id, prob in zip(row_ids, row_probs):
                labels_pred.append(self.__classes[class_id])
                probabilities_pred.append(round(prob * 100, 4))

        return labels_pred, probabilities_pred
tusker echidna platypus wallaby koala wombat jellyfish sea anemone brain coral flatworm nematode conch snail slug sea slug chiton chambered nautilus Dungeness crab rock crab fiddler crab king crab American lobster spiny lobster crayfish hermit crab isopod white stork black stork spoonbill flamingo little blue heron American egret bittern crane limpkin European gallinule American coot bustard ruddy turnstone red-backed sandpiper redshank dowitcher oystercatcher pelican king penguin albatross grey whale killer whale dugong sea lion Chihuahua Japanese spaniel Maltese dog Pekinese Shih-Tzu Blenheim spaniel papillon toy terrier Rhodesian ridgeback Afghan hound basset beagle bloodhound bluetick black-and-tan coonhound Walker hound English foxhound redbone borzoi Irish wolfhound Italian greyhound whippet Ibizan hound Norwegian elkhound otterhound Saluki Scottish deerhound Weimaraner Staffordshire bullterrier American Staffordshire terrier Bedlington terrier Border terrier Kerry blue terrier Irish terrier Norfolk terrier Norwich terrier Yorkshire terrier wire-haired fox terrier Lakeland terrier Sealyham terrier Airedale cairn Australian terrier Dandie Dinmont Boston bull miniature schnauzer giant schnauzer standard schnauzer Scotch terrier Tibetan terrier silky terrier soft-coated wheaten terrier West Highland white terrier Lhasa flat-coated retriever curly-coated retriever golden retriever Labrador retriever Chesapeake Bay retriever German short-haired pointer vizsla English setter Irish setter Gordon setter Brittany spaniel clumber English springer Welsh springer spaniel cocker spaniel Sussex spaniel Irish water spaniel kuvasz schipperke groenendael malinois briard kelpie komondor Old English sheepdog Shetland sheepdog collie Border collie Bouvier des Flandres Rottweiler German shepherd Doberman miniature pinscher Greater Swiss Mountain dog Bernese mountain dog Appenzeller EntleBucher boxer bull mastiff Tibetan mastiff French bulldog Great Dane Saint Bernard Eskimo dog 
malamute Siberian husky dalmatian affenpinscher basenji pug Leonberg Newfoundland Great Pyrenees Samoyed Pomeranian chow keeshond Brabancon griffon Pembroke Cardigan toy poodle miniature poodle standard poodle Mexican hairless timber wolf white wolf red wolf coyote dingo dhole African hunting dog hyena red fox kit fox Arctic fox grey fox tabby tiger cat Persian cat Siamese cat Egyptian cat cougar lynx leopard snow leopard jaguar lion tiger cheetah brown bear American black bear ice bear sloth bear mongoose meerkat tiger beetle ladybug ground beetle long-horned beetle leaf beetle dung beetle rhinoceros beetle weevil fly bee ant grasshopper cricket walking stick cockroach mantis cicada leafhopper lacewing dragonfly damselfly admiral ringlet monarch cabbage butterfly sulphur butterfly lycaenid starfish sea urchin sea cucumber wood rabbit hare Angora hamster porcupine fox squirrel marmot beaver guinea pig sorrel zebra hog wild boar warthog hippopotamus ox water buffalo bison ram bighorn ibex hartebeest impala gazelle Arabian camel llama weasel mink polecat black-footed ferret otter skunk badger armadillo three-toed sloth orangutan gorilla chimpanzee gibbon siamang guenon patas baboon macaque langur colobus proboscis monkey marmoset capuchin howler monkey titi spider monkey squirrel monkey Madagascar cat indri Indian elephant African elephant lesser panda giant panda barracouta eel coho rock beauty anemone fish sturgeon gar lionfish puffer abacus abaya academic gown accordion acoustic guitar aircraft carrier airliner airship altar ambulance amphibian analog clock apiary apron ashcan assault rifle backpack bakery balance beam balloon ballpoint Band Aid banjo bannister barbell barber chair barbershop barn barometer barrel barrow baseball basketball bassinet bassoon bathing cap bath towel bathtub beach wagon beacon beaker bearskin beer bottle beer glass bell cote bib bicycle-built-for-two bikini binder binoculars birdhouse boathouse bobsled bolo tie bonnet bookcase 
bookshop bottlecap bow bow tie brass brassiere breakwater breastplate broom bucket buckle bulletproof vest bullet train butcher shop cab caldron candle cannon canoe can opener cardigan car mirror carousel carpenter's kit carton car wheel cash machine cassette cassette player castle catamaran CD player cello cellular telephone chain chainlink fence chain mail chain saw chest chiffonier chime china cabinet Christmas stocking church cinema cleaver cliff dwelling cloak clog cocktail shaker coffee mug coffeepot coil combination lock computer keyboard confectionery container ship convertible corkscrew cornet cowboy boot cowboy hat cradle crane crash helmet crate crib Crock Pot croquet ball crutch cuirass dam desk desktop computer dial telephone diaper digital clock digital watch dining table dishrag dishwasher disk brake dock dogsled dome doormat drilling platform drum drumstick dumbbell Dutch oven electric fan electric guitar electric locomotive entertainment center envelope espresso maker face powder feather boa file fireboat fire engine fire screen flagpole flute folding chair football helmet forklift fountain fountain pen four-poster freight car French horn frying pan fur coat garbage truck gasmask gas pump goblet go-kart golf ball golfcart gondola gong gown grand piano greenhouse grille grocery store guillotine hair slide hair spray half track hammer hamper hand blower hand-held computer handkerchief hard disc harmonica harp harvester hatchet holster home theater honeycomb hook hoopskirt horizontal bar horse cart hourglass iPod iron jack-o'-lantern jean jeep jersey jigsaw puzzle jinrikisha joystick kimono knee pad knot lab coat ladle lampshade laptop lawn mower lens cap letter opener library lifeboat lighter limousine liner lipstick Loafer lotion loudspeaker loupe lumbermill magnetic compass mailbag mailbox maillot maillot manhole cover maraca marimba mask matchstick maypole maze measuring cup medicine chest megalith microphone microwave military uniform milk can 
minibus miniskirt minivan missile mitten mixing bowl mobile home Model T modem monastery monitor moped mortar mortarboard mosque mosquito net motor scooter mountain bike mountain tent mouse mousetrap moving van muzzle nail neck brace necklace nipple notebook obelisk oboe ocarina odometer oil filter organ oscilloscope overskirt oxcart oxygen mask packet paddle paddlewheel padlock paintbrush pajama palace panpipe paper towel parachute parallel bars park bench parking meter passenger car patio pay-phone pedestal pencil box pencil sharpener perfume Petri dish photocopier pick pickelhaube picket fence pickup pier piggy bank pill bottle pillow ping-pong ball pinwheel pirate pitcher plane planetarium plastic bag plate rack plow plunger Polaroid camera pole police van poncho pool table pop bottle pot potter's wheel power drill prayer rug printer prison projectile projector puck punching bag purse quill quilt racer racket radiator radio radio telescope rain barrel recreational vehicle reel reflex camera refrigerator remote control restaurant revolver rifle rocking chair rotisserie rubber eraser rugby ball rule running shoe safe safety pin saltshaker sandal sarong sax scabbard scale school bus schooner scoreboard screen screw screwdriver seat belt sewing machine shield shoe shop shoji shopping basket shopping cart shovel shower cap shower curtain ski ski mask sleeping bag slide rule sliding door slot snorkel snowmobile snowplow soap dispenser soccer ball sock solar dish sombrero soup bowl space bar space heater space shuttle spatula speedboat spider web spindle sports car spotlight stage steam locomotive steel arch bridge steel drum stethoscope stole stone wall stopwatch stove strainer streetcar stretcher studio couch stupa submarine suit sundial sunglass sunglasses sunscreen suspension bridge swab sweatshirt swimming trunks swing switch syringe table lamp tank tape player teapot teddy television tennis ball thatch theater curtain thimble thresher throne tile roof toaster 
tobacco shop toilet seat torch totem pole tow truck toyshop tractor trailer truck tray trench coat tricycle trimaran tripod triumphal arch trolleybus trombone tub turnstile typewriter keyboard umbrella unicycle upright vacuum vase vault velvet vending machine vestment viaduct violin volleyball waffle iron wall clock wallet wardrobe warplane washbasin washer water bottle water jug water tower whiskey jug whistle wig window screen window shade Windsor tie wine bottle wing wok wooden spoon wool worm fence wreck yawl yurt web site comic book crossword puzzle street sign traffic light book jacket menu plate guacamole consomme hot pot trifle ice cream ice lolly French loaf bagel pretzel cheeseburger hotdog mashed potato head cabbage broccoli cauliflower zucchini spaghetti squash acorn squash butternut squash cucumber artichoke bell pepper cardoon mushroom Granny Smith strawberry orange lemon fig pineapple banana jackfruit custard apple pomegranate hay carbonara chocolate sauce dough meat loaf pizza potpie burrito red wine espresso cup eggnog alp bubble cliff coral reef geyser lakeside promontory sandbar seashore valley volcano ballplayer groom scuba diver rapeseed daisy yellow lady's slipper corn acorn hip buckeye coral fungus agaric gyromitra stinkhorn earthstar hen-of-the-woods bolete ear toilet tissue ================================================ FILE: imageai/Detection/Custom/CUSTOMDETECTION.md ================================================ # ImageAI : Custom Object Detection ### TABLE OF CONTENTS - :white_square_button: Custom Object Detection - :white_square_button: Object Detection, Extraction and Fine-tune - :white_square_button: Hiding/Showing Object Name and Probability - :white_square_button: Image Input & Output Types - :white_square_button: Documentation ImageAI provides very convenient and powerful methods to perform object detection on images and extract each object from the image using your own **custom YOLOv3 or TinyYOLOv3 model** and the 
corresponding **.json** file generated during the training. To test the custom object detection, you can download a sample custom model we have trained to detect the Hololens headset and its **.json** file via the links below: * [**yolov3_hololens-yolo_mAP-0.82726_epoch-73.pt**](https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/yolov3_hololens-yolo_mAP-0.82726_epoch-73.pt) _(Size = 236 mb)_ * [**hololens-yolo_yolov3_detection_config.json**](https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/hololens-yolo_yolov3_detection_config.json) Once you download the custom object detection model file and its **.json** file, you should copy both files to your project folder where your **.py** files will be. Then create a python file and give it a name; an example is FirstCustomDetection.py. Then write the code below into the python file: ### FirstCustomDetection.py
```python from imageai.Detection.Custom import CustomObjectDetection detector = CustomObjectDetection() detector.setModelTypeAsYOLOv3() detector.setModelPath("yolov3_hololens-yolo_mAP-0.82726_epoch-73.pt") detector.setJsonPath("hololens-yolo_yolov3_detection_config.json") detector.loadModel() detections = detector.detectObjectsFromImage(input_image="holo2.jpg", output_image_path="holo2-detected.jpg") for detection in detections: print(detection["name"], " : ", detection["percentage_probability"], " : ", detection["box_points"]) ``` Sample Result - Input: ![Input](../../../data-images/holo2.jpg) Output: ![Output](../../../data-images/holo2-detected.jpg) ``` hololens : 39.69653248786926 : [611, 74, 751, 154] hololens : 87.6643180847168 : [23, 46, 90, 79] hololens : 89.25175070762634 : [191, 66, 243, 95] hololens : 64.49641585350037 : [437, 81, 514, 133] hololens : 91.78624749183655 : [380, 113, 423, 138] ``` Let us make a breakdown of the object detection code that we used above. ```python from imageai.Detection.Custom import CustomObjectDetection detector = CustomObjectDetection() detector.setModelTypeAsYOLOv3() ``` In the 3 lines above , we import the **ImageAI custom object detection** class in the first line, created the class instance on the second line and set the model type to YOLOv3. ```python detector.setModelPath("yolov3_hololens-yolo_mAP-0.82726_epoch-73.pt") detector.setJsonPath("hololens-yolo_yolov3_detection_config.json") detector.loadModel() ``` In the 3 lines above, we specified the file path to our downloaded model file in the first line , specified the path to our **hololens-yolo_yolov3_detection_config.json** file in the second line and loaded the model on the third line. 
```python detections = detector.detectObjectsFromImage(input_image="holo2.jpg", output_image_path="holo2-detected.jpg") for detection in detections: print(detection["name"], " : ", detection["percentage_probability"], " : ", detection["box_points"]) ``` In the 3 lines above, we ran the `detectObjectsFromImage()` function and passed in the path to our test image, and the path to the new image which the function will save. Then the function returns an array of dictionaries, with one dictionary for each object detected in the image. Each dictionary has the properties `name` (name of the object), `percentage_probability` (percentage probability of the detection) and `box_points` (the x1,y1,x2 and y2 coordinates of the bounding box of the object). ### Object Detection, Extraction and Fine-tune
In the examples we used above, we ran the object detection on an image and it returned the detected objects in an array as well as save a new image with rectangular markers drawn on each object. In our next examples, we will be able to extract each object from the input image and save it independently. In the example code below which is very identical to the previous object detection code, we will save each object detected as a separate image. ```python from imageai.Detection.Custom import CustomObjectDetection detector = CustomObjectDetection() detector.setModelTypeAsYOLOv3() detector.setModelPath("yolov3_hololens-yolo_mAP-0.82726_epoch-73.pt") detector.setJsonPath("hololens-yolo_yolov3_detection_config.json") detector.loadModel() detections, extracted_objects_array = detector.detectObjectsFromImage(input_image="holo2.jpg", output_image_path="holo2-detected.jpg", extract_detected_objects=True) for detection, object_path in zip(detections, extracted_objects_array): print(object_path) print(detection["name"], " : ", detection["percentage_probability"], " : ", detection["box_points"]) print("---------------") ``` Sample Result: Output Images ![](../../../data-images/holo2-detected-objects/hololens-1.jpg) ![](../../../data-images/holo2-detected-objects/hololens-2.jpg) ![](../../../data-images/holo2-detected-objects/hololens-3.jpg) ![](../../../data-images/holo2-detected-objects/hololens-4.jpg) ![](../../../data-images/holo2-detected-objects/hololens-5.jpg) ![](../../../data-images/holo2-detected-objects/hololens-6.jpg) ![](../../../data-images/holo2-detected-objects/hololens-7.jpg) Let us review the part of the code that perform the object detection and extract the images: ```python detections, extracted_objects_array = detector.detectObjectsFromImage(input_image="holo2.jpg", output_image_path="holo2-detected.jpg", extract_detected_objects=True) for detection, object_path in zip(detections, extracted_objects_array): print(object_path) print(detection["name"], " : ", 
detection["percentage_probability"], " : ", detection["box_points"]) print("---------------") ``` In the above lines, we called the `detectObjectsFromImage()` function and passed in the input image path, the output image path, and an extra parameter `extract_detected_objects=True`. This parameter states that the function should extract each object detected from the image and save it as a separate image. The parameter is `False` by default. Once set to `True`, the function will create a directory which is the `output image path + "-objects"`. Then it saves all the extracted images into this new directory with each image's name being the `detected object name + "-" + a number` which corresponds to the order in which the objects were detected. This new parameter we set to extract and save detected objects as images will make the function return 2 values. The first is the array of dictionaries with each dictionary corresponding to a detected object. The second is an array of the paths to the saved images of each object detected and extracted, arranged in the same order as the objects in the first array. ### And one important feature you need to know! You will recall that the percentage probability for each detected object is sent back by the `detectObjectsFromImage()` function. The function has a parameter `minimum_percentage_probability`, whose default value is `30` (value ranges between 0 - 100); the examples above use this default. That means the function will only return a detected object if its percentage probability is **30 or above**. The value was kept at this number to ensure the integrity of the detection results. You can fine-tune the object detection by setting `minimum_percentage_probability` to a smaller value to detect more objects or to a higher value to detect fewer objects. ### Hiding/Showing Object Name and Probability
**ImageAI** provides options to hide the name of objects detected and/or the percentage probability from being shown on the saved/returned detected image. Using the `detectObjectsFromImage()` and `detectCustomObjectsFromImage()` functions, the parameters `'display_object_name'` and `'display_percentage_probability'` can be set to `True` or `False` individually. Take a look at the code below: ```python detections = detector.detectObjectsFromImage(input_image=os.path.join(execution_path , "holo2.jpg"), output_image_path=os.path.join(execution_path , "holo2_nodetails.jpg"), minimum_percentage_probability=30, display_percentage_probability=False, display_object_name=False) ``` In the above code, we specified that both the object name and percentage probability should not be shown. As you can see in the result below, both the names of the objects and their individual percentage probabilities are not shown in the detected image. **Result** ![](../../../data-images/holo2-nodetails.jpg) ### Image Input & Output Types
**ImageAI** custom object detection supports 2 types of input, which are **file path to image file** (default) and **numpy array of an image**, as well as 2 types of output, which are image **file** (default) and numpy **array**. This means you can now perform object detection in production applications such as web servers and systems that return files in any of the above stated formats. To perform object detection with numpy array input, you just need to state the input type in the `.detectObjectsFromImage()` function. See example below. ```python detections = detector.detectObjectsFromImage(input_type="array", input_image=image_array , output_image_path=os.path.join(execution_path , "holo2-detected.jpg")) # For numpy array input type ``` To perform object detection with numpy array output, you just need to state the output type in the `.detectObjectsFromImage()` function. See example below. ```python detected_image_array, detections = detector.detectObjectsFromImage(output_type="array", input_image="holo2.jpg" ) # For numpy array output type ``` ### Documentation
We have provided full documentation for all **ImageAI** classes and functions. Find links below: * Documentation - **English Version** [https://imageai.readthedocs.io](https://imageai.readthedocs.io) ================================================ FILE: imageai/Detection/Custom/CUSTOMDETECTIONTRAINING.md ================================================ # ImageAI : Custom Detection Model Training ## --------------------------------------------------- ## Introducing Jarvis and TheiaEngine. We the creators of ImageAI are glad to announce 2 new AI projects to provide state-of-the-art Generative AI, LLM and Image Understanding on your personal computer and servers. [![](../../../jarvis.png)](https://jarvis.genxr.co) Install Jarvis on PC/Mac to setup limitless access to LLM powered AI Chats for your every day work, research and generative AI needs with 100% privacy and full offline capability. Visit [https://jarvis.genxr.co](https://jarvis.genxr.co/) to get started. [![](../../../theiaengine.png)](https://www.genxr.co/theia-engine) [TheiaEngine](https://www.genxr.co/theia-engine), the next-generation computer Vision AI API capable of all Generative and Understanding computer vision tasks in a single API call and available via REST API to all programming languages. Features include - **Detect 300+ objects** ( 220 more objects than ImageAI) - **Provide answers to any content or context questions** asked on an image - very useful to get information on any object, action or information without needing to train a new custom model for every tasks - **Generate scene description and summary** - **Convert 2D image to 3D pointcloud and triangular mesh** - **Semantic Scene mapping of objects, walls, floors, etc** - **Stateless Face recognition and emotion detection** - **Image generation and augmentation from prompt** - etc. Visit [https://www.genxr.co/theia-engine](https://www.genxr.co/theia-engine) to try the demo and join in the beta testing today. 
## --------------------------------------------------- **ImageAI** provides the most simple and powerful approach to training custom object detection models using the YOLOv3 architecture, which you can load into the `imageai.Detection.Custom.CustomObjectDetection` class. This allows you to train your own **YOLOv3** or **TinyYOLOv3** model on any set of images that corresponds to any type of objects of interest. The training process generates a JSON file that maps the object names in your image dataset and the detection anchors, as well as creating multiple models. In choosing the best model for your custom object detection task, an `evaluateModel()` function has been provided to compute the **mAP** of your saved models by allowing you to state your desired **IoU** and **Non-maximum Suppression** values. Then you can perform custom object detection using the model and the JSON file generated. ### TABLE OF CONTENTS - :white_square_button: Preparing your custom dataset - :white_square_button: Training on your custom Dataset - :white_square_button: Evaluating your saved detection models' mAP ### Preparing your custom dataset
To train a custom detection model, you need to prepare the images you want to use to train the model. You will prepare the images as follows: 1. Decide the type of object(s) you want to detect and collect about **200 (minimum recommendation)** or more pictures of each of the object(s) 2. Once you have collected the images, you need to annotate the object(s) in the images. **ImageAI** uses the **YOLO** format for image annotation. You can generate this annotation for your images using the easy to use [**LabelImg**](https://github.com/tzutalin/labelImg) image annotation tool, available for Windows, Linux and MacOS systems. Open the link below to install the annotation tool. See: [https://github.com/tzutalin/labelImg](https://github.com/tzutalin/labelImg) 3. When you are done annotating your images, **annotation .txt** files will be generated for each image in your dataset. The **annotation .txt** file describes each or **all** of the objects in the image. For example, if your image names are **image(1).jpg**, **image(2).jpg**, **image(3).jpg** till **image(z).jpg**; the corresponding annotation for each of the images will be **image(1).txt**, **image(2).txt**, **image(3).txt** till **image(z).txt**. 4. Once you have the annotations for all your images, create a folder for your dataset (e.g. headsets) and in this parent folder, create child folders **train** and **validation** 5. In the train folder, create **images** and **annotations** sub-folders. Put about 70-80% of your dataset of each object's images in the **images** folder and put the corresponding annotations for these images in the **annotations** folder. 6. In the validation folder, create **images** and **annotations** sub-folders. Put the rest of your dataset images in the **images** folder and put the corresponding annotations for these images in the **annotations** folder. 7. 
Once you have done this, the structure of your image dataset folder should look like below: ``` >> train >> images >> img_1.jpg (shows Object_1) >> images >> img_2.jpg (shows Object_2) >> images >> img_3.jpg (shows Object_1, Object_3 and Object_n) >> annotations >> img_1.txt (describes Object_1) >> annotations >> img_2.txt (describes Object_2) >> annotations >> img_3.txt (describes Object_1, Object_3 and Object_n) >> validation >> images >> img_151.jpg (shows Object_1, Object_3 and Object_n) >> images >> img_152.jpg (shows Object_2) >> images >> img_153.jpg (shows Object_1) >> annotations >> img_151.txt (describes Object_1, Object_3 and Object_n) >> annotations >> img_152.txt (describes Object_2) >> annotations >> img_153.txt (describes Object_1) ``` 8. You can train your custom detection model completely from scratch or use transfer learning (recommended for better accuracy) from a pre-trained YOLOv3 model. Also, we have provided a sample annotated Hololens and Headsets (Hololens and Oculus) dataset for you to train with. Download the pre-trained YOLOv3 model and the sample datasets in the link below. Download dataset `hololens-yolo.zip` [here](https://github.com/OlafenwaMoses/ImageAI/releases/tag/test-resources-v3) and pre-trained model `yolov3.pt` [here](https://github.com/OlafenwaMoses/ImageAI/releases/tag/3.0.0-pretrained) ### Training on your custom dataset
Before you start training your custom detection model, kindly take note of the following: - The default **batch_size** is 4. If you are training with **Google Colab**, this will be fine. However, I would advise you to use a more powerful GPU than the K80 offered by Colab, as the higher your **batch_size (8, 16)**, the better the accuracy of your detection model. Then your training code goes as follows: ```python from imageai.Detection.Custom import DetectionModelTrainer trainer = DetectionModelTrainer() trainer.setModelTypeAsYOLOv3() trainer.setDataDirectory(data_directory="hololens-yolo") trainer.setTrainConfig(object_names_array=["hololens"], batch_size=4, num_experiments=200, train_from_pretrained_model="yolov3.pt") # In the above,when training for detecting multiple objects, #set object_names_array=["object1", "object2", "object3",..."objectz"] trainer.trainModel() ``` Yes! Just 6 lines of code and you can train object detection models on your custom dataset. Now let's take a look at how the code above works. ```python from imageai.Detection.Custom import DetectionModelTrainer trainer = DetectionModelTrainer() trainer.setModelTypeAsYOLOv3() trainer.setDataDirectory(data_directory="hololens-yolo") ``` In the first line, we import the **ImageAI** detection model training class, then we define the model trainer in the second line, we set the network type in the third line and, in the fourth line, set the path to the image dataset we want to train the network on. ```python trainer.setTrainConfig(object_names_array=["hololens"], batch_size=4, num_experiments=200, train_from_pretrained_model="yolov3.pt") ``` In the line above, we configured our detection model trainer. 
The parameters we stated in the function are as below: - **object_names_array** : this is an array containing the names of the objects in our dataset - **batch_size** : this is to state the batch size for the training - **num_experiments** : this is to state the number of times the network will train over all the training images, which is also called epochs - **train_from_pretrained_model(optional)** : this is to train using transfer learning from a pre-trained **YOLOv3** model ```python trainer.trainModel() ``` When you start the training, you should see something like this in the console: ``` Generating anchor boxes for training images... thr=0.25: 1.0000 best possible recall, 6.93 anchors past thr n=9, img_size=416, metric_all=0.463/0.856-mean/best, past_thr=0.549-mean: ==================== Pretrained YOLOv3 model loaded to initialize weights ==================== Epoch 1/100 ---------- Train: 30it [00:14, 2.09it/s] box loss-> 0.09820, object loss-> 0.27985, class loss-> 0.00000 Validation: 15it [01:45, 7.05s/it] recall: 0.085714 precision: 0.000364 mAP@0.5: 0.000186, mAP@0.5-0.95: 0.000030 Epoch 2/100 ---------- Train: 30it [00:07, 4.25it/s] box loss-> 0.08691, object loss-> 0.07011, class loss-> 0.00000 Validation: 15it [01:37, 6.53s/it] recall: 0.214286 precision: 0.000854 mAP@0.5: 0.000516, mAP@0.5-0.95: 0.000111 . . . . ``` Let us explain the details shown above: ``` Generating anchor boxes for training images... thr=0.25: 1.0000 best possible recall, 6.93 anchors past thr n=9, img_size=416, metric_all=0.463/0.856-mean/best, past_thr=0.549-mean: ==================== Pretrained YOLOv3 model loaded to initialize weights ==================== ``` The above details signify the following: - **ImageAI** autogenerates the best match detection **anchor boxes** for your image dataset. - The pretrained **yolov3.pt** model was loaded to initialize the weights used to train the model. 
``` Epoch 1/100 ---------- Train: 30it [00:14, 2.09it/s] box loss-> 0.09820, object loss-> 0.27985, class loss-> 0.00000 Validation: 15it [01:45, 7.05s/it] recall: 0.085714 precision: 0.000364 mAP@0.5: 0.000186, mAP@0.5-0.95: 0.000030 Epoch 2/100 ---------- Train: 30it [00:07, 4.25it/s] box loss-> 0.08691, object loss-> 0.07011, class loss-> 0.00000 Validation: 15it [01:37, 6.53s/it] recall: 0.214286 precision: 0.000854 mAP@0.5: 0.000516, mAP@0.5-0.95: 0.000111 ``` - The above signifies the progress of the training. - For each experiment (Epoch), a number of metrics are computed. The important ones for choosing an accurate model are detailed below - The bounding box loss `box loss` is reported and expected to drop as the training progresses - The object localization loss `object loss` is reported and expected to drop as the training progresses - The class loss `class loss` is reported and expected to drop as the training progresses. If the class loss persists at 0.0000, it's because your dataset has a single class. - The `mAP@0.5` (mAP50) and `mAP@0.5-0.95` metrics are expected to increase. This signifies that the model's accuracy is increasing. There might be fluctuations in these metrics sometimes. - For each increase in the `mAP50` after an experiment, a model is saved in the **hololens-yolo/models** folder. The higher the mAP50, the better the model. Once you are done training, you can visit the link below for performing object detection with your **custom detection model** and **detection_config.json** file. [Detection/Custom/CUSTOMDETECTION.md](./CUSTOMDETECTION.md) ### >> Documentation
We have provided full documentation for all **ImageAI** classes and functions. Find links below: * Documentation - **English Version** [https://imageai.readthedocs.io](https://imageai.readthedocs.io) ================================================ FILE: imageai/Detection/Custom/CUSTOMVIDEODETECTION.md ================================================ # ImageAI : Custom Video Object Detection, Tracking and Analysis ### TABLE OF CONTENTS - :white_square_button: First Custom Video Object Detection - :white_square_button: Camera / Live Stream Video Detection - :white_square_button: Video Analysis - :white_square_button: Hiding/Showing Object Name and Probability - :white_square_button: Frame Detection Intervals - :white_square_button: Video Detection Timeout (NEW) - :white_square_button: Documentation ImageAI provides convenient, flexible and powerful methods to perform object detection on videos using your own **custom YOLOv3 model** and the corresponding **.json** file generated during the training. This version of **ImageAI** provides commercial grade video objects detection features, which include but not limited to device/IP camera inputs, per frame, per second, per minute and entire video analysis for storing in databases and/or real-time visualizations and for future insights. 
To test the custom video object detection, you can download a sample custom model we have trained to detect the Hololens headset and its **.json** file via the links below: * [**yolov3_hololens-yolo_mAP-0.82726_epoch-73.pt**](https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/yolov3_hololens-yolo_mAP-0.82726_epoch-73.pt) _(Size = 236 mb)_ * [**hololens-yolo_yolov3_detection_config.json**](https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/hololens-yolo_yolov3_detection_config.json) Because video object detection is a compute-intensive task, we advise you to perform this experiment using a computer with an NVIDIA GPU and the GPU (CUDA) version of PyTorch installed. Performing video object detection on a CPU will be slower than using an NVIDIA GPU powered computer. You can use Google Colab for this experiment as it has an NVIDIA K80 GPU available for free.
Once you download the custom object detection model and JSON files, you should copy the model and the JSON files to your project folder where your .py files will be. Then create a python file and give it a name; an example is FirstCustomVideoObjectDetection.py. Then write the code below into the python file:
### FirstCustomVideoObjectDetection.py
```python from imageai.Detection.Custom import CustomVideoObjectDetection import os execution_path = os.getcwd() video_detector = CustomVideoObjectDetection() video_detector.setModelTypeAsYOLOv3() video_detector.setModelPath("yolov3_hololens-yolo_mAP-0.82726_epoch-73.pt") video_detector.setJsonPath("hololens-yolo_yolov3_detection_config.json") video_detector.loadModel() video_detector.detectObjectsFromVideo(input_file_path="holo1.mp4", output_file_path=os.path.join(execution_path, "holo1-detected3"), frames_per_second=20, minimum_percentage_probability=40, log_progress=True) ``` [**Input Video**](../../../data-videos/holo1.mp4) [![Input Video](../../../data-images/holo-video.jpg)](../../../data-videos/holo1.mp4) [**Output Video**](https://www.youtube.com/watch?v=4o5GyAR4Mpw) [![Output Video](../../../data-images/holo-video-detected.jpg)](https://www.youtube.com/watch?v=4o5GyAR4Mpw) Let us make a breakdown of the object detection code that we used above. ```python from imageai.Detection.Custom import CustomVideoObjectDetection import os execution_path = os.getcwd() ``` In the 3 lines above, we import the **ImageAI custom video object detection** class in the first line, import the **os** module in the second line and obtain the path to the folder where our python file runs in the third line. ```python video_detector = CustomVideoObjectDetection() video_detector.setModelTypeAsYOLOv3() video_detector.setModelPath("yolov3_hololens-yolo_mAP-0.82726_epoch-73.pt") video_detector.setJsonPath("hololens-yolo_yolov3_detection_config.json") video_detector.loadModel() ``` In the 5 lines above, we created a new instance of the `CustomVideoObjectDetection` class in the first line, set the model type to YOLOv3 in the second line, set the model path to our custom YOLOv3 model file in the third line, specified the path to the model's corresponding **hololens-yolo_yolov3_detection_config.json** in the fourth line and loaded the model in the fifth line. 
```python video_detector.detectObjectsFromVideo(input_file_path="holo1.mp4", output_file_path=os.path.join(execution_path, "holo1-detected3"), frames_per_second=20, minimum_percentage_probability=40, log_progress=True) ``` In the code above, we ran the `detectObjectsFromVideo()` function and passed in the path to our video, the path to the new video (without the extension, it saves a .mp4 video by default) which the function will save, the number of frames per second (fps) that we desire the output video to have, the minimum percentage probability, and the option to log the progress of the detection in the console. Then the function returns the path to the saved video which contains boxes and percentage probabilities rendered on objects detected in the video. ### Camera / Live Stream Video Detection
**ImageAI** now allows live-video detection with support for camera inputs. Using **OpenCV**'s **VideoCapture()** function, you can load live-video streams from a device camera, cameras connected by cable or IP cameras, and pass them into **ImageAI**'s **detectObjectsFromVideo()** function. All features that are supported for detecting objects in a video file are also available for detecting objects in a camera's live-video feed. Find below an example of detecting objects in a live-video feed from the device camera. ```python from imageai.Detection.Custom import CustomVideoObjectDetection import os import cv2 execution_path = os.getcwd() camera = cv2.VideoCapture(0) video_detector = CustomVideoObjectDetection() video_detector.setModelTypeAsYOLOv3() video_detector.setModelPath("yolov3_hololens-yolo_mAP-0.82726_epoch-73.pt") video_detector.setJsonPath("hololens-yolo_yolov3_detection_config.json") video_detector.loadModel() video_detector.detectObjectsFromVideo(camera_input=camera, output_file_path=os.path.join(execution_path, "holo1-detected3"), frames_per_second=20, minimum_percentage_probability=40, log_progress=True) ``` The difference between the code above and the code for the detection of a video file is that we defined an **OpenCV VideoCapture** instance and loaded the default device camera into it. Then we passed the camera we defined into the parameter **camera_input**, which replaces the **input_file_path** that is used for a video file. ### Video Analysis
**ImageAI** now provides commercial-grade video analysis in the Custom Video Object Detection class, for both video file inputs and camera inputs. This feature allows developers to obtain deep insights into any video processed with **ImageAI**. These insights can be visualized in real-time or stored in a NoSQL database for future review or analysis.
For video analysis, the **detectObjectsFromVideo()** function now allows you to state your own defined functions which will be executed for every frame, second and/or minute of the video detected, as well as state a function that will be executed at the end of a video detection. Once these functions are stated, they will receive raw but comprehensive analytical data on the index of the frame/second/minute, objects detected (name, percentage_probability and box_points), number of instances of each unique object detected and average number of occurrences of each unique object detected over a second/minute and the entire video. To obtain the video analysis, all you need to do is specify a function, state the corresponding parameters it will be receiving and pass the function name into the **per_frame_function**, **per_second_function**, **per_minute_function** and **video_complete_function** parameters in the detection function. Find below examples of video analysis functions. ```python def forFrame(frame_number, output_array, output_count): print("FOR FRAME " , frame_number) print("Output for each object : ", output_array) print("Output count for unique objects : ", output_count) print("------------END OF A FRAME --------------") def forSeconds(second_number, output_arrays, count_arrays, average_output_count): print("SECOND : ", second_number) print("Array for the outputs of each frame ", output_arrays) print("Array for output count for unique objects in each frame : ", count_arrays) print("Output average count for unique objects in the last second: ", average_output_count) print("------------END OF A SECOND --------------") def forMinute(minute_number, output_arrays, count_arrays, average_output_count): print("MINUTE : ", minute_number) print("Array for the outputs of each frame ", output_arrays) print("Array for output count for unique objects in each frame : ", count_arrays) print("Output average count for unique objects in the last minute: ", average_output_count) 
print("------------END OF A MINUTE --------------") video_detector = CustomVideoObjectDetection() video_detector.setModelTypeAsYOLOv3() video_detector.setModelPath("yolov3_hololens-yolo_mAP-0.82726_epoch-73.pt") video_detector.setJsonPath("hololens-yolo_yolov3_detection_config.json") video_detector.loadModel() video_detector.detectObjectsFromVideo(camera_input=camera, output_file_path=os.path.join(execution_path, "holo1-detected3"), frames_per_second=20, per_second_function=forSeconds, per_frame_function = forFrame, per_minute_function= forMinute, minimum_percentage_probability=40, log_progress=True) ``` **ImageAI** also allows you to obtain a complete analysis of the entire video processed. All you need is to define a function like the forSeconds or forMinute function and pass it as the **video_complete_function** parameter of your **.detectObjectsFromVideo()** function. It receives the same values as the per_second_function and per_minute_function, except that no index is passed: only the other 3 values are passed, and they cover all frames in the video. Below is a sample function: ```python def forFull(output_arrays, count_arrays, average_output_count): #Perform action on the 3 parameters returned into the function video_detector.detectObjectsFromVideo(camera_input=camera, output_file_path=os.path.join(execution_path, "holo1-detected3"), video_complete_function=forFull, minimum_percentage_probability=40, log_progress=True) ``` **FINAL NOTE ON VIDEO ANALYSIS** : **ImageAI** allows you to obtain the detected video frame as a Numpy array in each frame, second and minute function. All you need to do is specify one more parameter in your function and set **return_detected_frame=True** in your **detectObjectsFromVideo()** function. Once this is set, the extra parameter you specified in your function will be the Numpy array of the detected frame. 
See a sample below: ```python def forFrame(frame_number, output_array, output_count, detected_frame): print("FOR FRAME " , frame_number) print("Output for each object : ", output_array) print("Output count for unique objects : ", output_count) print("Returned Objects is : ", type(detected_frame)) print("------------END OF A FRAME --------------") video_detector.detectObjectsFromVideo(camera_input=camera, output_file_path=os.path.join(execution_path, "holo1-detected3"), per_frame_function=forFrame, minimum_percentage_probability=40, log_progress=True, return_detected_frame=True) ``` ### Frame Detection Intervals
The above video object detection tasks are optimized for frame-real-time object detection, which ensures that objects in every frame of the video are detected. **ImageAI** provides you the option to adjust the video frame detections, which can speed up your video detection process. When calling the `.detectObjectsFromVideo()` function, you can specify at which frame interval detections should be made. Setting the **frame_detection_interval** parameter to 5 or 20 means the object detections in the video will be updated after every 5 frames or 20 frames. If your output video **frames_per_second** is set to 20, that means the object detections in the video will be updated once every quarter of a second or once every second, respectively. This is useful in scenarios where the available compute is less powerful and the speeds of moving objects are low. This ensures you can have objects detected as second-real-time, half-a-second-real-time or whichever way suits your needs. ### Custom Video Detection Timeout
**ImageAI** now allows you to set a timeout in seconds for detection of objects in videos or camera live feed. To set a timeout for your video detection code, all you need to do is specify the `detection_timeout` parameter in the `detectObjectsFromVideo()` function to the number of desired seconds. In the example code below, we set `detection_timeout` to 120 seconds (2 minutes). ```python from imageai.Detection.Custom import CustomVideoObjectDetection import os import cv2 execution_path = os.getcwd() camera = cv2.VideoCapture(0) video_detector = CustomVideoObjectDetection() video_detector.setModelTypeAsYOLOv3() video_detector.setModelPath("yolov3_hololens-yolo_mAP-0.82726_epoch-73.pt") video_detector.setJsonPath("hololens-yolo_yolov3_detection_config.json") video_detector.loadModel() video_detector.detectObjectsFromVideo(camera_input=camera, output_file_path=os.path.join(execution_path, "holo1-detected3"), frames_per_second=20, minimum_percentage_probability=40, detection_timeout=120) ``` ### >> Documentation
We have provided full documentation for all **ImageAI** classes and functions. Find links below: * Documentation - **English Version** [https://imageai.readthedocs.io](https://imageai.readthedocs.io)** ================================================ FILE: imageai/Detection/Custom/__init__.py ================================================ import os import time import math import json import warnings from typing import List, Union, Tuple, Dict from collections import defaultdict import numpy as np from PIL import Image import cv2 import torch from torch.cuda import amp from torch.utils.data import DataLoader from torch.optim import SGD, lr_scheduler from tqdm import tqdm from .yolo.dataset import LoadImagesAndLabels from .yolo.custom_anchors import generate_anchors from .yolo.compute_loss import compute_loss from .yolo import validate from ...yolov3.tiny_yolov3 import YoloV3Tiny from ...yolov3.yolov3 import YoloV3 from ...yolov3.utils import draw_bbox_and_label, get_predictions, prepare_image from ...backend_check.model_extension import extension_check class DetectionModelTrainer: """ This is the Detection Model training class, which allows you to train object detection models on image datasets that are in YOLO format, using the YOLOv3. 
""" def __init__(self) -> None: self.__device = "cuda" if torch.cuda.is_available() else "cpu" self.__cuda = (self.__device != "cpu") self.__model_type = "" self.__model = None self.__optimizer = None self.__data_dir = "" self.__classes: List[str] = None self.__num_classes = None self.__anchors = None self.__dataset_name = None self.__mini_batch_size: int = None self.__scaler = amp.GradScaler(enabled=self.__cuda) self.__lr_lambda = None self.__custom_train_dataset = None self.__custom_val_dataset = None self.__train_loader = None self.__val_loader = None self.__model_path: str = None self.__epochs: int = None self.__output_models_dir: str = None self.__output_json_dir: str = None def __set_training_param(self, epochs : int, accumulate : int) -> None: # self.__lr_lambda = lambda x : ((1 - math.cos(x * math.pi / epochs)) / 2 ) * (0.1 - 1.0) + 1.0 self.__lr_lambda = lambda x: (1 - x / (epochs - 1)) * (1.0 - 0.01) + 0.01 self.__anchors = generate_anchors( self.__custom_train_dataset, n=9 if self.__model_type=="yolov3" else 6 ) self.__anchors = [round(i) for i in self.__anchors.reshape(-1).tolist()] if self.__model_type == "yolov3": self.__model = YoloV3( num_classes=self.__num_classes, anchors=self.__anchors, device=self.__device ) elif self.__model_type == "tiny-yolov3": self.__model = YoloV3Tiny( num_classes=self.__num_classes, anchors=self.__anchors, device=self.__device ) if self.__model_path: self.__load_model() w_d = (5e-4) * (self.__mini_batch_size * accumulate / 64) # scale weight decay g0, g1, g2 = [], [], [] # optimizer parameter groups for m in self.__model.modules(): if hasattr(m, 'bias') and isinstance(m.bias, torch.nn.Parameter): # bias g2.append(m.bias) if isinstance(m, torch.nn.BatchNorm2d): # weight (no decay) g0.append(m.weight) elif hasattr(m, 'weight') and isinstance(m.weight, torch.nn.Parameter): # weight (with decay) g1.append(m.weight) self.__optimizer = SGD( g0, lr=1e-2, momentum=0.6, # weight_decay=w_d, nesterov=True ) 
self.__optimizer.add_param_group({'params': g1, 'weight_decay': w_d}) # add g1 with weight_decay self.__optimizer.add_param_group({'params': g2}) # add g2 (biases) self.__lr_scheduler = lr_scheduler.LambdaLR( self.__optimizer, lr_lambda=self.__lr_lambda ) del g0, g1, g2 self.__model.to(self.__device) def __load_model(self) -> None: try: state_dict = torch.load(self.__model_path, map_location=self.__device) # check against cases where number of classes differs, causing the # channel of the convolutional layer just before the detection layer # to differ. new_state_dict = {k:v for k,v in state_dict.items() if k in self.__model.state_dict().keys() and v.shape==self.__model.state_dict()[k].shape} self.__model.load_state_dict(new_state_dict, strict=False) print("="*20) print("Pretrained YOLOv3 model loaded to initialize weights") print("="*20) except Exception as e: print("="*20) print("pretrained weight loading failed. Defaulting to using random weight.") print("="*20) def __load_data(self) -> None: self.__num_classes = len(self.__classes) self.__dataset_name = os.path.basename(os.path.dirname(self.__data_dir+os.path.sep)) self.__custom_train_dataset = LoadImagesAndLabels(self.__data_dir, train=True) self.__custom_val_dataset = LoadImagesAndLabels(self.__data_dir, train=False) self.__train_loader = DataLoader( self.__custom_train_dataset, batch_size=self.__mini_batch_size, shuffle=True, collate_fn=self.__custom_train_dataset.collate_fn ) self.__val_loader = DataLoader( self.__custom_val_dataset, batch_size=self.__mini_batch_size//2, shuffle=True, collate_fn=self.__custom_val_dataset.collate_fn ) def setModelTypeAsYOLOv3(self) -> None: """ 'setModelTypeAsYOLOv3()' is used to set the model type to the YOLOv3 model. :return: """ self.__model_type = "yolov3" def setModelTypeAsTinyYOLOv3(self) -> None: """ 'setModelTypeAsTinyYOLOv3()' is used to set the model type to the TinyYOLOv3 model. 
:return: """ self.__model_type = "tiny-yolov3" def setDataDirectory(self, data_directory: str): """ 'setDataDirectory()' is required to set the path to which the data/dataset to be used for training is kept. The input dataset must be in the YOLO format. The directory can have any name, but it must have 'train' and 'validation' sub-directory. In the 'train' and 'validation' sub-directories, there must be 'images' and 'annotations' sub-directories respectively. The 'images' folder will contain the pictures for the dataset and the 'annotations' folder will contain the TXT files with details of the annotations for each image in the 'images folder'. N.B: Strictly take note that the filenames (without the extension) of the pictures in the 'images folder' must be the same as the filenames (except the extension) of their corresponding annotation TXT files in the 'annotations' folder. The structure of the 'train' and 'validation' folder must be as follows: >> train >> images >> img_1.jpg >> images >> img_2.jpg >> images >> img_3.jpg >> annotations >> img_1.txt >> annotations >> img_2.txt >> annotations >> img_3.txt >> validation >> images >> img_151.jpg >> images >> img_152.jpg >> images >> img_153.jpg >> annotations >> img_151.txt >> annotations >> img_152.txt >> annotations >> img_153.txt :param data_directory: :return: """ if os.path.isdir(data_directory): self.__data_dir = data_directory else: raise ValueError( "The parameter passed should point to a valid directory" ) def setTrainConfig(self, object_names_array: List[str], batch_size: int=4, num_experiments=100, train_from_pretrained_model: str = None): """ 'setTrainConfig()' function allows you to set the properties for the training instances. 
It accepts the following values: - object_names_array , this is an array of the names of the different objects in your dataset, in the index order your dataset is annotated - batch_size (optional), this is the batch size for the training instance - num_experiments (optional), also known as epochs, it is the number of times the network will train on all the training dataset - train_from_pretrained_model (optional), this is used to perform transfer learning by specifying the path to a pre-trained YOLOv3 or TinyYOLOv3 model :param object_names_array: :param batch_size: :param num_experiments: :param train_from_pretrained_model: :return: """ self.__model_path = train_from_pretrained_model if self.__model_path: extension_check(self.__model_path) self.__classes = object_names_array self.__mini_batch_size = batch_size self.__epochs = num_experiments self.__output_models_dir = os.path.join(self.__data_dir, "models") self.__output_json_dir = os.path.join(self.__data_dir, "json") def trainModel(self) -> None: """ 'trainModel()' function starts the actual model training. Once the training starts, the training instance creates 3 sub-folders in your dataset folder which are: - json, where the JSON configuration file for using your trained model is stored - models, where your trained models are stored once they are generated after each improved experiments - cache , where temporary traing configuraton files are stored :return: """ self.__load_data() os.makedirs(self.__output_models_dir, exist_ok=True) os.makedirs(self.__output_json_dir, exist_ok=True) mp, mr, map50, map50_95, best_fitness = 0, 0, 0, 0, 0.0 nbs = 64 # norminal batch size nb = len(self.__train_loader) # number of batches nw = max(3 * nb, 1000) # number of warmup iterations. last_opt_step = -1 prev_save_name, recent_save_name = "", "" accumulate = max(round(nbs / self.__mini_batch_size), 1) # accumulate loss before optimizing. 
self.__set_training_param(self.__epochs, accumulate) with open(os.path.join(self.__output_json_dir, f"{self.__dataset_name}_{self.__model_type}_detection_config.json"), "w") as configWriter: json.dump( { "labels": self.__classes, "anchors": self.__anchors }, configWriter ) since = time.time() self.__lr_scheduler.last_epoch = -1 for epoch in range(1, self.__epochs+1): self.__optimizer.zero_grad() mloss = torch.zeros(3, device=self.__device) print(f"Epoch {epoch}/{self.__epochs}", "-"*10, sep="\n") for phase in ["train", "validation"]: if phase=="train": self.__model.train() print("Train: ") for batch_i, (data, anns) in tqdm(enumerate(self.__train_loader)): batches_done = batch_i + nb * epoch data = data.to(self.__device) anns = anns.to(self.__device) # warmup if batches_done <= nw: xi = [0, nw] # x interp accumulate = max(1, np.interp(batches_done, xi, [1, nbs / self.__mini_batch_size]).round()) for j, x in enumerate(self.__optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp(batches_done, xi, [0.1 if j == 2 else 0.0, 0.01 * self.__lr_lambda(epoch)]) if 'momentum' in x: x['momentum'] = np.interp(batches_done, xi, [0.8, 0.9]) with amp.autocast(enabled=self.__cuda): _ = self.__model(data) loss_layers = self.__model.get_loss_layers() loss, loss_components = compute_loss(loss_layers, anns.detach(), self.__device) self.__scaler.scale(loss).backward() mloss = (mloss * batch_i + loss_components) / (batch_i + 1) # Optimize if batches_done - last_opt_step >= accumulate: self.__scaler.step(self.__optimizer) # optimizer.step self.__scaler.update() self.__optimizer.zero_grad() last_opt_step = batches_done print(f" box loss-> {float(mloss[0]):.5f}, object loss-> {float(mloss[1]):.5f}, class loss-> {float(mloss[2]):.5f}") self.__lr_scheduler.step() else: self.__model.eval() print("Validation:") mp, mr, map50, map50_95 = validate.run( self.__model, self.__val_loader, self.__num_classes, device=self.__device ) print(f" 
recall: {mr:0.6f} precision: {mp:0.6f} mAP@0.5: {map50:0.6f}, mAP@0.5-0.95: {map50_95:0.6f}" "\n") if map50 > best_fitness: best_fitness = map50 recent_save_name = self.__model_type+f"_{self.__dataset_name}_mAP-{best_fitness:0.5f}_epoch-{epoch}.pt" if prev_save_name: os.remove(os.path.join(self.__output_models_dir, prev_save_name)) torch.save( self.__model.state_dict(), os.path.join(self.__output_models_dir, recent_save_name) ) prev_save_name = recent_save_name if epoch == self.__epochs: torch.save( self.__model.state_dict(), os.path.join(self.__output_models_dir, self.__model_type+f"_{self.__dataset_name}_last.pt") ) elapsed_time = time.time() - since print(f"Training completed in {elapsed_time//60:.0f}m {elapsed_time % 60:.0f}s") torch.cuda.empty_cache() class CustomObjectDetection: """ This is the object detection class for using your custom trained models. It supports your custom trained YOLOv3 and TinyYOLOv3 model and allows to you to perform object detection in images. """ def __init__(self) -> None: self.__device = "cuda" if torch.cuda.is_available() else "cpu" self.__anchors: List[int] = None self.__classes: List[str] = None self.__model = None self.__model_loaded: bool = False self.__model_path: str = None self.__json_path: str = None self.__model_type: str = None self.__nms_score = 0.4 self.__objectness_score = 0.4 def setModelTypeAsYOLOv3(self) -> None: """ 'setModelTypeAsYOLOv3()' is used to set the model type to the YOLOv3 model. :return: """ self.__model_type = "yolov3" def setModelTypeAsTinyYOLOv3(self) -> None: """ 'setModelTypeAsTinyYOLOv3()' is used to set the model type to the TinyYOLOv3 model. :return: """ self.__model_type = "tiny-yolov3" def setModelPath(self, model_path: str): if os.path.isfile(model_path): extension_check(model_path) self.__model_path = model_path self.__model_loaded = False else: raise ValueError( "invalid path, path not pointing to the weightfile." 
) from None self.__model_path = model_path def setJsonPath(self, configuration_json: str): self.__json_path = configuration_json def __load_classes_and_anchors(self) -> List[str]: with open(self.__json_path) as f: json_config = json.load(f) self.__anchors = json_config["anchors"] self.__classes = json_config["labels"] def __load_image_yolo(self, input_image : Union[str, np.ndarray, Image.Image]) -> Tuple[List[str], List[np.ndarray], torch.Tensor, torch.Tensor]: """ Loads image/images from the given path. If the given path is a directory, this function only load the images in the directory (it does noot visit the subdirectories). """ allowed_exts = ["jpg", "jpeg", "png"] fnames = [] original_dims = [] inputs = [] original_imgs = [] if type(input_image) == str: if os.path.isfile(input_image): if input_image.rsplit('.')[-1].lower() in allowed_exts: img = cv2.imread(input_image) else: raise ValueError(f"image path '{input_image}' is not found or a valid file") elif type(input_image) == np.ndarray: img = input_image elif "PIL" in str(type(input_image)): img = np.asarray(input_image) else: raise ValueError(f"Invalid image input format") img_h, img_w, _ = img.shape original_imgs.append(np.array(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)).astype(np.uint8)) original_dims.append((img_w, img_h)) if type(input_image) == str: fnames.append(os.path.basename(input_image)) else: fnames.append("") inputs.append(prepare_image(img, (416, 416))) if original_dims: return ( fnames, original_imgs, torch.FloatTensor(original_dims).repeat(1,2).to(self.__device), torch.cat(inputs, 0).to(self.__device) ) raise RuntimeError( f"Error loading image." "\nEnsure the file is a valid image," " allowed file extensions are .jpg, .jpeg, .png" ) def useCPU(self): """ Used to force classification to be done on CPU. By default, classification will occur on GPU compute if available else CPU compute. 
""" self.__device = "cpu" if self.__model_loaded: self.__model_loaded = False self.loadModel() def loadModel(self) -> None: """ Loads the pretrained weights in the specified model path. """ self.__load_classes_and_anchors() if self.__model_type == "yolov3": self.__model = YoloV3( anchors=self.__anchors, num_classes=len(self.__classes), device=self.__device ) elif self.__model_type == "tiny-yolov3": self.__model = YoloV3Tiny( anchors=self.__anchors, num_classes=len(self.__classes), device=self.__device ) else: raise ValueError(f"Invalid model type. Call setModelTypeAsYOLOv3() or setModelTypeAsTinyYOLOv3() to set a model type before loading the model") self.__model.to(self.__device) state_dict = torch.load(self.__model_path, map_location=self.__device) try: self.__model.load_state_dict(state_dict) self.__model_loaded = True self.__model.to(self.__device).eval() except Exception as e: raise RuntimeError(f"Invalid weights!!! {e}") def detectObjectsFromImage(self, input_image: Union[str, np.ndarray, Image.Image], output_image_path: str=None, output_type: str ="file", extract_detected_objects: bool=False, minimum_percentage_probability: int=40, display_percentage_probability: bool=True, display_object_name: bool=True, display_box: bool=True, custom_objects: List=None, nms_treshold: float= 0.4, objectness_treshold: float= 0.4, ) -> Union[List[List[Tuple[str, float, Dict[str, int]]]], np.ndarray, List[np.ndarray], List[str]]: """ Detects objects in an image using the unique classes provided by COCO. :param input_image: path to an image file, cv2 image or PIL image :param output_image_path: path to save input image with predictions rendered :param output_type: type of output for rendered image. 
Acceptable values are 'file' and 'array` ( a cv2 image ) :param extract_detected_objects: extract each object based on the output type :param minimum_percentage_probability: the minimum confidence a detected object must have :param display_percentage_probability: to diplay/not display the confidence on rendered image :param display_object_name: to diplay/not display the object name on rendered image :param display_box: to diplay/not display the object bounding box on rendered image :param custom_objects: a dictionary of detectable objects set to boolean values :returns: A list of tuples containing the label of detected object and the confidence. """ self.__nms_score = nms_treshold self.__objectness_score = objectness_treshold self.__model.eval() if not self.__model_loaded: if self.__model_path: warnings.warn( "Model path has changed but pretrained weights in the" " new path is yet to be loaded.", ResourceWarning ) else: raise RuntimeError( "Model path isn't set, pretrained weights aren't used." 
) predictions = defaultdict(lambda : []) if self.__model_type == "yolov3" or self.__model_type == "tiny-yolov3": fnames, original_imgs, input_dims, imgs = self.__load_image_yolo(input_image) with torch.no_grad(): output = self.__model(imgs) output = get_predictions( pred=output.to(self.__device), num_classes=len(self.__classes), nms_confidence_level=self.__nms_score, objectness_confidence= self.__objectness_score, device=self.__device ) if output is None: if output_type == "array": if extract_detected_objects: return original_imgs[0], [], [] else: return original_imgs[0], [] else: if extract_detected_objects: return original_imgs[0], [] else: return [] # scale the output to match the dimension of the original image input_dims = torch.index_select(input_dims, 0, output[:, 0].long()) scaling_factor = torch.min(416 / input_dims, 1)[0].view(-1, 1) output[:, [1,3]] -= (416 - (scaling_factor * input_dims[:, 0].view(-1,1))) / 2 output[:, [2,4]] -= (416 - (scaling_factor * input_dims[:, 1].view(-1,1))) / 2 output[:, 1:5] /= scaling_factor #clip bounding box for those that extended outside the detected image. 
for idx in range(output.shape[0]): output[idx, [1,3]] = torch.clamp(output[idx, [1,3]], 0.0, input_dims[idx, 0]) output[idx, [2,4]] = torch.clamp(output[idx, [2,4]], 0.0, input_dims[idx, 1]) for pred in output: pred_label = self.__classes[int(pred[-1])] if custom_objects: if pred_label.replace(" ", "_") in custom_objects.keys(): if not custom_objects[pred_label.replace(" ", "_")]: continue else: continue predictions[int(pred[0])].append(( pred_label, float(pred[-2]), {k:v for k,v in zip(["x1", "y1", "x2", "y2"], map(int, pred[1:5]))}, )) # Render detection on copy of input image original_input_image = None output_image_array = None extracted_objects = [] if self.__model_type == "yolov3" or self.__model_type == "tiny-yolov3": original_input_image = cv2.cvtColor(original_imgs[0], cv2.COLOR_RGB2BGR) if isinstance(output, torch.Tensor): for pred in output: percentage_conf = round(float(pred[-2]) * 100, 2) if percentage_conf < minimum_percentage_probability: continue displayed_label = "" if display_object_name: displayed_label = f"{self.__classes[int(pred[-1].item())]} : " if display_percentage_probability: displayed_label += f" {percentage_conf}%" original_imgs[int(pred[0].item())] = draw_bbox_and_label(pred[1:5].int() if display_box else None, displayed_label, original_imgs[int(pred[0].item())] ) output_image_array = cv2.cvtColor(original_imgs[0], cv2.COLOR_RGB2BGR) # Format predictions for function reponse predictions_batch = list(predictions.values()) predictions_list = predictions_batch[0] if len(predictions_batch) > 0 else [] min_probability = minimum_percentage_probability / 100 if output_type == "file": if output_image_path: cv2.imwrite(output_image_path, output_image_array) if extract_detected_objects: extraction_dir = ".".join(output_image_path.split(".")[:-1]) + "-extracted" os.mkdir(extraction_dir) count = 0 for obj_prediction in predictions_list: if obj_prediction[1] >= min_probability: count += 1 extracted_path = os.path.join( extraction_dir, 
".".join(os.path.basename(output_image_path).split(".")[:-1]) + f"-{count}.jpg" ) obj_bbox = obj_prediction[2] cv2.imwrite(extracted_path, original_input_image[obj_bbox["y1"] : obj_bbox["y2"], obj_bbox["x1"] : obj_bbox["x2"]]) extracted_objects.append(extracted_path) elif output_type == "array": if extract_detected_objects: for obj_prediction in predictions_list: if obj_prediction[1] >= min_probability: obj_bbox = obj_prediction[2] extracted_objects.append(original_input_image[obj_bbox["y1"] : obj_bbox["y2"], obj_bbox["x1"] : obj_bbox["x2"]]) else: raise ValueError(f"Invalid output_type '{output_type}'. Supported values are 'file' and 'array' ") predictions_list = [ { "name": prediction[0], "percentage_probability": round(prediction[1] * 100, 2), "box_points": [prediction[2]["x1"], prediction[2]["y1"], prediction[2]["x2"], prediction[2]["y2"]] } for prediction in predictions_list if prediction[1] >= min_probability ] if output_type == "array": if extract_detected_objects: return output_image_array, predictions_list, extracted_objects else: return output_image_array, predictions_list else: if extract_detected_objects: return predictions_list, extracted_objects else: return predictions_list class CustomVideoObjectDetection: """ This is the custom objects detection class for videos and camera live stream inputs in the ImageAI library. It provides support for YOLOv3 and TinyYOLOv3 object detection networks. After instantiating this class, you can set it's properties and make object detections using it's pre-defined functions. 
The following functions are required to be called before object detection can be made * setModelPath() * At least of of the following and it must correspond to the model set in the setModelPath() [setModelTypeAsRetinaNet(), setModelTypeAsYOLOv3(), setModelTinyYOLOv3()] * loadModel() [This must be called once only before performing object detection] Once the above functions have been called, you can call the detectObjectsFromVideo() function or the detectCustomObjectsFromVideo() of the object detection instance object at anytime to obtain observable objects in any video or camera live stream. """ def __init__(self): self.__detector = CustomObjectDetection() def setModelTypeAsYOLOv3(self): self.__detector.setModelTypeAsYOLOv3() def setModelTypeAsTinyYOLOv3(self): self.__detector.setModelTypeAsTinyYOLOv3() def setModelPath(self, model_path: str): extension_check(model_path) self.__detector.setModelPath(model_path) def setJsonPath(self, configuration_json: str): self.__detector.setJsonPath(configuration_json) def loadModel(self): self.__detector.loadModel() def useCPU(self): self.__detector.useCPU() def detectObjectsFromVideo(self, input_file_path="", camera_input=None, output_file_path="", frames_per_second=20, frame_detection_interval=1, minimum_percentage_probability=40, log_progress=False, display_percentage_probability=True, display_object_name=True, display_box=True, save_detected_video=True, per_frame_function=None, per_second_function=None, per_minute_function=None, video_complete_function=None, return_detected_frame=False, detection_timeout = None): """ 'detectObjectsFromVideo()' function is used to detect objects observable in the given video path or a camera input: * input_file_path , which is the file path to the input video. It is required only if 'camera_input' is not set * camera_input , allows you to parse in camera input for live video detections * output_file_path , which is the path to the output video. 
It is required only if 'save_detected_video' is not set to False * frames_per_second , which is the number of frames to be used in the output video * frame_detection_interval (optional, 1 by default) , which is the intervals of frames that will be detected. * minimum_percentage_probability (optional, 50 by default) , option to set the minimum percentage probability for nominating a detected object for output. * log_progress (optional) , which states if the progress of the frame processed is to be logged to console * display_percentage_probability (optional), can be used to hide or show probability scores on the detected video frames * display_object_name (optional), can be used to show or hide object names on the detected video frames * save_save_detected_video (optional, True by default), can be set to or not to save the detected video * per_frame_function (optional), this parameter allows you to parse in a function you will want to execute after each frame of the video is detected. If this parameter is set to a function, after every video frame is detected, the function will be executed with the following values parsed into it: -- position number of the frame -- an array of dictinaries, with each dictionary corresponding to each object detected. Each dictionary contains 'name', 'percentage_probability' and 'box_points' -- a dictionary with with keys being the name of each unique objects and value are the number of instances of the object present -- If return_detected_frame is set to True, the numpy array of the detected frame will be parsed as the fourth value into the function * per_second_function (optional), this parameter allows you to parse in a function you will want to execute after each second of the video is detected. 
If this parameter is set to a function, after every second of a video is detected, the function will be executed with the following values parsed into it: -- position number of the second -- an array of dictionaries whose keys are position number of each frame present in the last second , and the value for each key is the array for each frame that contains the dictionaries for each object detected in the frame -- an array of dictionaries, with each dictionary corresponding to each frame in the past second, and the keys of each dictionary are the name of the number of unique objects detected in each frame, and the key values are the number of instances of the objects found in the frame -- a dictionary with its keys being the name of each unique object detected throughout the past second, and the key values are the average number of instances of the object found in all the frames contained in the past second -- If return_detected_frame is set to True, the numpy array of the detected frame will be parsed as the fifth value into the function * per_minute_function (optional), this parameter allows you to parse in a function you will want to execute after each minute of the video is detected. 
If this parameter is set to a function, after every minute of a video is detected, the function will be executed with the following values parsed into it: -- position number of the minute -- an array of dictionaries whose keys are position number of each frame present in the last minute , and the value for each key is the array for each frame that contains the dictionaries for each object detected in the frame -- an array of dictionaries, with each dictionary corresponding to each frame in the past minute, and the keys of each dictionary are the name of the number of unique objects detected in each frame, and the key values are the number of instances of the objects found in the frame -- a dictionary with its keys being the name of each unique object detected throughout the past minute, and the key values are the average number of instances of the object found in all the frames contained in the past minute -- If return_detected_frame is set to True, the numpy array of the detected frame will be parsed as the fifth value into the function * video_complete_function (optional), this parameter allows you to parse in a function you will want to execute after all of the video frames have been detected. 
If this parameter is set to a function, after all of frames of a video is detected, the function will be executed with the following values parsed into it: -- an array of dictionaries whose keys are position number of each frame present in the entire video , and the value for each key is the array for each frame that contains the dictionaries for each object detected in the frame -- an array of dictionaries, with each dictionary corresponding to each frame in the entire video, and the keys of each dictionary are the name of the number of unique objects detected in each frame, and the key values are the number of instances of the objects found in the frame -- a dictionary with its keys being the name of each unique object detected throughout the entire video, and the key values are the average number of instances of the object found in all the frames contained in the entire video * return_detected_frame (optionally, False by default), option to obtain the return the last detected video frame into the per_per_frame_function, per_per_second_function or per_per_minute_function * detection_timeout (optionally, None by default), option to state the number of seconds of a video that should be detected after which the detection function stop processing the video * thread_safe (optional, False by default), enforce the loaded detection model works across all threads if set to true, made possible by forcing all Tensorflow inference to run on the default graph. 
:param input_file_path: :param camera_input :param output_file_path: :param save_detected_video: :param frames_per_second: :param frame_detection_interval: :param minimum_percentage_probability: :param log_progress: :param display_percentage_probability: :param display_object_name: :param per_frame_function: :param per_second_function: :param per_minute_function: :param video_complete_function: :param return_detected_frame: :param detection_timeout: :param thread_safe: :return output_video_filepath: :return counting: :return output_objects_array: :return output_objects_count: :return detected_copy: :return this_second_output_object_array: :return this_second_counting_array: :return this_second_counting: :return this_minute_output_object_array: :return this_minute_counting_array: :return this_minute_counting: :return this_video_output_object_array: :return this_video_counting_array: :return this_video_counting: """ if (input_file_path == "" and camera_input == None): raise ValueError( "You must set 'input_file_path' to a valid video file, or set 'camera_input' to a valid camera") elif (save_detected_video == True and output_file_path == ""): raise ValueError( "You must set 'output_video_filepath' to a valid video file name, in which the detected video will be saved. 
If you don't intend to save the detected video, set 'save_detected_video=False'") else: output_frames_dict = {} output_frames_count_dict = {} input_video = cv2.VideoCapture(input_file_path) if (camera_input != None): input_video = camera_input output_video_filepath = output_file_path + '.mp4' frame_width = int(input_video.get(3)) frame_height = int(input_video.get(4)) output_video = cv2.VideoWriter(output_video_filepath, cv2.VideoWriter_fourcc(*"MP4V"), frames_per_second, (frame_width, frame_height)) counting = 0 detection_timeout_count = 0 video_frames_count = 0 while (input_video.isOpened()): ret, frame = input_video.read() if (ret == True): video_frames_count += 1 if (detection_timeout != None): if ((video_frames_count % frames_per_second) == 0): detection_timeout_count += 1 if (detection_timeout_count >= detection_timeout): break output_objects_array = [] counting += 1 if (log_progress == True): print("Processing Frame : ", str(counting)) detected_copy = frame.copy() check_frame_interval = counting % frame_detection_interval if (counting == 1 or check_frame_interval == 0): try: detected_copy, output_objects_array = self.__detector.detectObjectsFromImage( input_image=frame, output_type="array", minimum_percentage_probability=minimum_percentage_probability, display_percentage_probability=display_percentage_probability, display_object_name=display_object_name, display_box=display_box) except Exception as e: warnings.warn() if (save_detected_video == True): output_video.write(detected_copy) if detected_copy is not None and output_objects_array is not None: output_frames_dict[counting] = output_objects_array output_objects_count = {} for eachItem in output_objects_array: eachItemName = eachItem["name"] try: output_objects_count[eachItemName] = output_objects_count[eachItemName] + 1 except: output_objects_count[eachItemName] = 1 output_frames_count_dict[counting] = output_objects_count if (counting == 1 or check_frame_interval == 0): if (per_frame_function != None): 
if (return_detected_frame == True): per_frame_function(counting, output_objects_array, output_objects_count, detected_copy) elif (return_detected_frame == False): per_frame_function(counting, output_objects_array, output_objects_count) if (per_second_function != None): if (counting != 1 and (counting % frames_per_second) == 0): this_second_output_object_array = [] this_second_counting_array = [] this_second_counting = {} for aa in range(counting): if (aa >= (counting - frames_per_second)): this_second_output_object_array.append(output_frames_dict[aa + 1]) this_second_counting_array.append(output_frames_count_dict[aa + 1]) for eachCountingDict in this_second_counting_array: for eachItem in eachCountingDict: try: this_second_counting[eachItem] = this_second_counting[eachItem] + \ eachCountingDict[eachItem] except: this_second_counting[eachItem] = eachCountingDict[eachItem] for eachCountingItem in this_second_counting: this_second_counting[eachCountingItem] = int(this_second_counting[eachCountingItem] / frames_per_second) if (return_detected_frame == True): per_second_function(int(counting / frames_per_second), this_second_output_object_array, this_second_counting_array, this_second_counting, detected_copy) elif (return_detected_frame == False): per_second_function(int(counting / frames_per_second), this_second_output_object_array, this_second_counting_array, this_second_counting) if (per_minute_function != None): if (counting != 1 and (counting % (frames_per_second * 60)) == 0): this_minute_output_object_array = [] this_minute_counting_array = [] this_minute_counting = {} for aa in range(counting): if (aa >= (counting - (frames_per_second * 60))): this_minute_output_object_array.append(output_frames_dict[aa + 1]) this_minute_counting_array.append(output_frames_count_dict[aa + 1]) for eachCountingDict in this_minute_counting_array: for eachItem in eachCountingDict: try: this_minute_counting[eachItem] = this_minute_counting[eachItem] + \ eachCountingDict[eachItem] 
except: this_minute_counting[eachItem] = eachCountingDict[eachItem] for eachCountingItem in this_minute_counting: this_minute_counting[eachCountingItem] = int(this_minute_counting[eachCountingItem] / (frames_per_second * 60)) if (return_detected_frame == True): per_minute_function(int(counting / (frames_per_second * 60)), this_minute_output_object_array, this_minute_counting_array, this_minute_counting, detected_copy) elif (return_detected_frame == False): per_minute_function(int(counting / (frames_per_second * 60)), this_minute_output_object_array, this_minute_counting_array, this_minute_counting) else: break if (video_complete_function != None): this_video_output_object_array = [] this_video_counting_array = [] this_video_counting = {} for aa in range(counting): this_video_output_object_array.append(output_frames_dict[aa + 1]) this_video_counting_array.append(output_frames_count_dict[aa + 1]) for eachCountingDict in this_video_counting_array: for eachItem in eachCountingDict: try: this_video_counting[eachItem] = this_video_counting[eachItem] + \ eachCountingDict[eachItem] except: this_video_counting[eachItem] = eachCountingDict[eachItem] for eachCountingItem in this_video_counting: this_video_counting[eachCountingItem] = int(this_video_counting[eachCountingItem] / counting) video_complete_function(this_video_output_object_array, this_video_counting_array, this_video_counting) input_video.release() output_video.release() if (save_detected_video == True): return output_video_filepath ================================================ FILE: imageai/Detection/Custom/yolo/__init__.py ================================================ ================================================ FILE: imageai/Detection/Custom/yolo/compute_loss.py ================================================ import math import torch import torch.nn as nn # This new loss function is based on https://github.com/ultralytics/yolov3/blob/master/utils/loss.py def bbox_iou(box1, box2, x1y1x2y2=True, 
GIoU=False, DIoU=False, CIoU=False, eps=1e-9):
    """
    Return the IoU of box1 to box2, or its GIoU / DIoU / CIoU variant.

    :param box1: box coordinates, indexed on the first axis (box1[0] .. box1[3])
    :param box2: box coordinates; transposed on entry so it is indexed the same way as box1
    :param x1y1x2y2: True if boxes are given as corners (x1, y1, x2, y2),
                     False if they are given as (x center, y center, width, height)
    :param GIoU: return the Generalized IoU
    :param DIoU: return the Distance IoU (checked before CIoU when both are set)
    :param CIoU: return the Complete IoU
    :param eps: small constant added to heights, union and enclosing-box terms to avoid division by zero
    :return: the plain IoU when no variant flag is set, otherwise the IoU minus the variant's penalty term
    """
    # Returns the IoU of box1 to box2. box1 is 4, box2 is nx4
    box2 = box2.T

    # Get the coordinates of bounding boxes
    if x1y1x2y2:  # x1, y1, x2, y2 = box1
        b1_x1, b1_y1, b1_x2, b1_y2 = box1[0], box1[1], box1[2], box1[3]
        b2_x1, b2_y1, b2_x2, b2_y2 = box2[0], box2[1], box2[2], box2[3]
    else:  # transform from xywh to xyxy
        b1_x1, b1_x2 = box1[0] - box1[2] / 2, box1[0] + box1[2] / 2
        b1_y1, b1_y2 = box1[1] - box1[3] / 2, box1[1] + box1[3] / 2
        b2_x1, b2_x2 = box2[0] - box2[2] / 2, box2[0] + box2[2] / 2
        b2_y1, b2_y2 = box2[1] - box2[3] / 2, box2[1] + box2[3] / 2

    # Intersection area (clamped at 0 for boxes that do not overlap)
    inter = (torch.min(b1_x2, b2_x2) - torch.max(b1_x1, b2_x1)).clamp(0) * \
            (torch.min(b1_y2, b2_y2) - torch.max(b1_y1, b2_y1)).clamp(0)

    # Union Area
    w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps
    w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps
    union = w1 * h1 + w2 * h2 - inter + eps

    iou = inter / union
    if GIoU or DIoU or CIoU:
        # convex (smallest enclosing box) width
        cw = torch.max(b1_x2, b2_x2) - torch.min(b1_x1, b2_x1)
        ch = torch.max(b1_y2, b2_y2) - torch.min(b1_y1, b2_y1)  # convex height
        if CIoU or DIoU:  # Distance or Complete IoU https://arxiv.org/abs/1911.08287v1
            c2 = cw ** 2 + ch ** 2 + eps  # convex diagonal squared
            rho2 = ((b2_x1 + b2_x2 - b1_x1 - b1_x2) ** 2 +
                    (b2_y1 + b2_y2 - b1_y1 - b1_y2) ** 2) / 4  # center distance squared
            if DIoU:
                return iou - rho2 / c2  # DIoU
            elif CIoU:  # https://github.com/Zzh-tju/DIoU-SSD-pytorch/blob/master/utils/box/box_utils.py#L47
                # aspect-ratio consistency term
                v = (4 / math.pi ** 2) * \
                    torch.pow(torch.atan(w2 / h2) - torch.atan(w1 / h1), 2)
                # alpha is computed without tracking gradients
                with torch.no_grad():
                    alpha = v / ((1 + eps) - iou + v)
                return iou - (rho2 / c2 + v * alpha)  # CIoU
        else:  # GIoU https://arxiv.org/pdf/1902.09630.pdf
            c_area = cw * ch + eps  # convex area
            return iou - (c_area - union) / c_area  # GIoU
    else:
        return iou  # IoU


def compute_loss(loss_layers, targets, device="cpu"):
    """
    Compute the YOLO training loss over all detection layers.

    :param loss_layers: the detection layers; each must expose 'num_classes' and 'pred'
    :param targets: the ground-truth targets, forwarded unchanged to build_targets()
    :param device: device on which the loss accumulators are created
    :return: a tuple (loss, components): 'loss' is the sum of the weighted box, objectness
             and class losses multiplied by the first dimension of the objectness target
             (presumably the batch size - TODO confirm against the layer output shape);
             'components' is the detached concatenation (box, objectness, class)
    """
    nc = loss_layers[0].num_classes
    nl = len(loss_layers)

    # output at each layer
    predictions = [layer.pred for layer in loss_layers]

    # placeholders for the losses.
    lcls, lbox, lobj = torch.zeros(1, device=device), torch.zeros(1, device=device), torch.zeros(1, device=device)

    # Build yolo targets
    tcls, tbox, indices, anchors = build_targets(predictions, targets, loss_layers, device)  # targets

    BCEcls = nn.BCEWithLogitsLoss(
        pos_weight=torch.tensor([1.0], device=device))
    BCEobj = nn.BCEWithLogitsLoss(
        pos_weight=torch.tensor([1.0], device=device))

    # per-layer weight applied to the objectness loss
    balance = [4.0, 1.0, 0.4]

    # Calculate losses for each yolo layer
    for layer_index, layer_predictions in enumerate(predictions):
        # Get image ids, anchors, grid index i and j for each target in the current yolo layer
        b, anchor, grid_j, grid_i = indices[layer_index]
        # Build empty object target tensor with the same shape as the object prediction
        tobj = torch.zeros_like(layer_predictions[..., 0], device=device)  # target obj
        # Get the number of targets for this layer.
        # Each target is a label box with some scaling and the association of an anchor box.
        # Label boxes may be associated to 0 or multiple anchors. So they are multiple times or not at all in the targets.
        num_targets = b.shape[0]
        # Check if there are targets for this batch
        if num_targets:
            # Load the corresponding values from the predictions for each of the targets
            ps = layer_predictions[b, anchor, grid_j, grid_i]

            # Regression of the box
            # Apply sigmoid to xy offset predictions in each cell that has a target, mapped to the range (-0.5, 1.5)
            pxy = ps[:, :2].sigmoid() * 2 - 0.5
            # Map wh predictions through (2 * sigmoid) ** 2 and multiply with the anchor box that matched the label for each cell that has a target
            pwh = (ps[:, 2:4].sigmoid() * 2) ** 2 * anchors[layer_index]
            # Build box out of xy and wh
            pbox = torch.cat((pxy, pwh), 1)
            # Calculate CIoU for each target with the predicted box for its cell + anchor
            iou = bbox_iou(pbox.T, tbox[layer_index], x1y1x2y2=False, CIoU=True)
            # We want to minimize the loss and the best possible IoU is 1, so we take 1 - IoU and reduce it with a mean
            lbox += (1.0 - iou).mean()  # iou loss

            # Classification of the objectness
            # Fill our empty object target tensor with the IoU we just calculated for each target at the targets position
            tobj[b, anchor, grid_j, grid_i] = iou.detach().clamp(0).type(tobj.dtype)  # Use cells with iou > 0 as object targets

            # Classification of the class
            # Check if we need to do a classification (number of classes > 1)
            if nc > 1:
                # One-hot class encoding
                t = torch.full_like(ps[:, 5:], 0.0, device=device)  # targets
                t[range(num_targets), tcls[layer_index]] = 1
                # Use the tensor to calculate the BCE loss
                lcls += BCEcls(ps[:, 5:], t)  # BCE

        # Classification of the objectness, continued
        # Calculate the BCE loss between the on the fly generated target and the network prediction
        obji = BCEobj(layer_predictions[..., 4], tobj)  # obj loss
        lobj += obji * balance[layer_index]

    lbox *= 0.05
    lobj *= (1.0 * ((416 / 640) ** 2))  # scale to image size
    lcls *= (0.5 * (nc / 80))  # scale to classes

    # Merge losses; tobj is the objectness target of the last layer processed above
    loss = (lbox + lobj + lcls) * tobj.shape[0]

    return loss, (torch.cat((lbox, lobj, lcls))).detach()


def build_targets(p, targets, loss_layers, device="cpu"):
# Build targets for compute_loss(), input targets(image,class,x,y,w,h) na, nt = len(loss_layers[0].anchors), targets.shape[0] # number of anchors, targets tcls, tbox, indices, anch = [], [], [], [] gain = torch.ones(7, device=device) # normalized to gridspace gain # Make a tensor that iterates 0-2 for 3 anchors and repeat that as many times as we have target boxes ai = torch.arange(na, device=device).float().view(na, 1).repeat(1, nt) # Copy target boxes anchor size times and append an anchor index to each copy the anchor index is also expressed by the new first dimension targets = torch.cat((targets.repeat(na, 1, 1), ai[:, :, None]), 2) g = 0.5 off = torch.tensor([ [0, 0], [1, 0], [0, 1], [-1, 0], [0, -1] ], device=device).float() * g #offsets for i, yolo_layer in enumerate(loss_layers): # Scale anchors by the yolo grid cell size so that an anchor with the size of the cell would result in 1 anchors = yolo_layer.anchors / yolo_layer.stride # Add the number of yolo cells in this layer the gain tensor # The gain tensor matches the collums of our targets (img id, class, x, y, w, h, anchor id) gain[2:6] = torch.tensor(p[i].shape)[[3, 2, 3, 2]] # xyxy gain # Scale targets by the number of yolo layer cells, they are now in the yolo cell coordinate system t = targets * gain # Check if we have targets if nt: # Calculate ration between anchor and target box for both width and height r = t[:, :, 4:6] / anchors[:, None] # Select the ratios that have the highest divergence in any axis and check if the ratio is less than 4 j = torch.max(r, 1.0 / r).max(2)[0] < 4.0 # compare # Only use targets that have the correct ratios for their anchors # That means we only keep ones that have a matching anchor and we loose the anchor dimension # The anchor id is still saved in the 7th value of each target t = t[j] #offsets gxy = t[:, 2:4] #grid xy gxi = gain[[2,3]] - gxy j, k = ((gxy % 1 < g) & (gxy > 1)).T l, m = ((gxi % 1 < g) & (gxi > 1)).T j = torch.stack((torch.ones_like(j), j, k, l, m)) 
def generate_anchors(dataset, n=9, img_size=416, thr=4.0, gen=1000, verbose=True):
    """ Creates kmeans-evolved anchors from training dataset

        Arguments:
            dataset: a loaded dataset i.e. subclass of torch.utils.data.Dataset.
                Only `dataset.shapes` (array of image width/height pairs) and
                `dataset.labels` (one (m, 5) array per image, columns 3:5 being the
                normalized box width/height) are read.
            n: number of anchors
            img_size: image size used for training
            thr: anchor-label wh ratio threshold used for training, default=4.0
            gen: generations to evolve anchors using genetic algorithm
            verbose: print all results; when False nothing is printed at all

        Return:
            k: kmeans evolved anchors, an (n, 2) array sorted from small to large area
    """
    thr = 1 / thr

    # NOTE: the helpers below read `wh` / `wh0` from the enclosing scope at call time,
    # so they must only be called after both have been converted to tensors.
    def metric(k, wh):  # compute metrics
        r = wh[:, None] / k[None]
        x = torch.min(r, 1 / r).min(2)[0]  # ratio metric
        return x, x.max(1)[0]  # x, best_x

    def anchor_fitness(k):  # mutation fitness
        _, best = metric(torch.tensor(k, dtype=torch.float32), wh)
        return (best * (best > thr).float()).mean()  # fitness

    def print_results(k, verbose=True):
        k = k[np.argsort(k.prod(1))]  # sort small to large
        if verbose:
            x, best = metric(k, wh0)
            bpr, aat = (best > thr).float().mean(), (x > thr).float().mean() * n  # best possible recall, anch > thr
            s = f'thr={thr:.2f}: {bpr:.4f} best possible recall, {aat:.2f} anchors past thr\n' \
                f'n={n}, img_size={img_size}, metric_all={x.mean():.3f}/{best.mean():.3f}-mean/best, ' \
                f'past_thr={x[x > thr].mean():.3f}-mean: '
            print(s)
        return k

    # Get label wh
    # Rescale every image so that its longest side equals img_size
    shapes = img_size * dataset.shapes / dataset.shapes.max(1, keepdims=True)
    wh0 = np.concatenate([l[:, 3:5] * s for s, l in zip(shapes, dataset.labels)])  # wh

    # Filter
    i = (wh0 < 3.0).any(1).sum()
    if i and verbose:
        print(f'WARNING: Extremely small objects found. {i} of {len(wh0)} labels are < 3 pixels in size.')
    wh = wh0[(wh0 >= 2.0).any(1)]  # filter > 2 pixels
    # wh = wh * (np.random.rand(wh.shape[0], 1) * 0.9 + 0.1)  # multiply by random scale 0-1

    # Kmeans calculation
    s = wh.std(0)  # sigmas for whitening
    k, _ = kmeans(wh / s, n, iter=30)  # points (the mean distance is not needed)
    assert len(k) == n, f'ERROR: scipy.cluster.vq.kmeans requested {n} points but returned only {len(k)}'
    k *= s
    wh = torch.tensor(wh, dtype=torch.float32)  # filtered
    wh0 = torch.tensor(wh0, dtype=torch.float32)  # unfiltered
    k = print_results(k, verbose=False)

    # Evolve
    npr = np.random
    f, sh, mp, s = anchor_fitness(k), k.shape, 0.9, 0.1  # fitness, anchor array shape, mutation prob, sigma
    if verbose:
        print("Generating anchor boxes for training images...")
    for _ in range(gen):
        v = np.ones(sh)
        while (v == 1).all():  # mutate until a change occurs (prevent duplicates)
            v = ((npr.random(sh) < mp) * random.random() * npr.randn(*sh) * s + 1).clip(0.3, 3.0)
        kg = (k.copy() * v).clip(min=2.0)
        fg = anchor_fitness(kg)
        if fg > f:  # keep the mutation only when it improves the fitness
            f, k = fg, kg.copy()

    # Honour the caller's `verbose` flag for the final report as well
    return print_results(k, verbose=verbose)
class LoadImagesAndLabels(Dataset):
    """
    Dataset of images and YOLO-format labels read from a directory.

    The images are listed from `<path>/train/images` (or `<path>/validation/images`
    when `train` is False). The label file of an image is looked up by replacing the
    `images` directory with `annotations` and the file extension with `.txt`.

    Public attributes:
        shapes: array with one (width, height) pair per readable image.
        labels: list with one raw (m, 5) label array per readable image.
    """

    def __init__(self, path : str, net_dim=(416, 416), train=True):
        """
        Index every readable image below `path` and load its raw labels.

        Raises:
            NotADirectoryError: when `path` is not an existing directory.
        """
        if not os.path.isdir(path):
            raise NotADirectoryError("path is not a valid directory!!!")
        super().__init__()

        if train:
            path = os.path.join(path, "train")
        else:
            path = os.path.join(path, "validation")
        self.__net_width, self.__net_height = net_dim
        self.__images_paths = []
        self.shapes = []
        self.labels = []
        for img in os.listdir(os.path.join(path, "images")):
            p = os.path.join(path, "images", img)
            image = cv.imread(p)
            # cv.imread does not return an array for files it cannot decode; skip those
            if isinstance(image, np.ndarray):
                l_p = self.__img_path2label_path(p)
                self.__images_paths.append(p)
                self.shapes.append((image.shape[1], image.shape[0]))
                self.labels.append(self.__load_raw_label(l_p))
        self.__nsamples = len(self.__images_paths)
        self.shapes = np.array(self.shapes)

    def __len__(self) -> int:
        return self.__nsamples

    def __img_path2label_path(self, path : str) -> str:
        """Map an image path to its label path (`images` -> `annotations`, extension -> `.txt`)."""
        im, lb = os.sep+"images"+os.sep, os.sep+"annotations"+os.sep
        # rsplit(im, 1) replaces only the last "images" directory of the path
        return lb.join(path.rsplit(im, 1)).rsplit(".", 1)[0] + ".txt"

    def __getitem__(self, idx) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return the prepared image and its processed label tensor for sample `idx`."""
        if idx >= self.__nsamples:
            raise IndexError("Index out of range.")
        image_path = self.__images_paths[idx]
        label = self.labels[idx].copy()
        image, label = self.__load_data(image_path, label)
        return image, label

    def __xywhn2xyxy(self, nlabel : torch.Tensor, width : int, height : int) -> torch.Tensor:
        """
        Transformed label from normalized center_x, center_y, width, height to
        x_1, y_1, x_2, y_2
        """
        label = nlabel.clone()
        # Always read from the untouched input so earlier assignments do not leak into later ones
        label[:, 1] = (nlabel[:, 1] - (nlabel[:, 3] / 2)) * width
        label[:, 2] = (nlabel[:, 2] - (nlabel[:, 4] / 2)) * height
        label[:, 3] = (nlabel[:, 1] + (nlabel[:, 3] / 2)) * width
        label[:, 4] = (nlabel[:, 2] + (nlabel[:, 4] / 2)) * height
        return label

    def __load_data(self, img_path : str, label : np.ndarray) -> Tuple[torch.Tensor, torch.Tensor]:
        """Read the image from disk, prepare it for the network and process its label."""
        img = cv.imread(img_path)
        img_h, img_w = img.shape[:2]
        img = prepare_image(img[:, :, :3], [self.__net_width, self.__net_height])
        lab = self.__process_label(label, img_w, img_h)
        return img.squeeze(), lab

    def __load_raw_label(self, label_path : str) -> np.ndarray:
        """
        Load the (m, 5) label array stored in `label_path`.

        A missing or empty label file yields an empty (0, 5) array.
        """
        if os.path.isfile(label_path):
            with warnings.catch_warnings():
                # np.loadtxt emits a UserWarning for an empty annotation file, which is
                # a valid "no objects" label; catch_warnings() alone does not silence it.
                warnings.simplefilter("ignore", category=UserWarning)
                labels = np.loadtxt(label_path).reshape(-1,5)
            assert (labels >= 0).all(), "bounding box values should be positive and in range 0 - 1"
            assert (labels[:, 1:] <= 1).all(), "bounding box values should be in the range 0 - 1"
        else:
            labels = np.zeros((0,5), dtype=np.float32)
        return labels

    def __process_label(self, label : np.ndarray, image_width : int, image_height : int) -> torch.Tensor:
        """
        Process corresponding label and resize the ground truth bounding boxes
        to match the dimension of the resized image.

        Returns a (m, 6) tensor: column 0 is left at zero (collate_fn writes the image
        index there), columns 1-5 hold class, center_x, center_y, width, height.
        """
        # NOTE(review): the network width is used for both axes below (scale, padding and
        # normalization). That is equivalent to using the height only for square network
        # dimensions -- confirm against prepare_image before using non-square sizes.
        scaling_factor = min(
                            self.__net_width/image_width,
                            self.__net_width/image_height
                        )
        bs = torch.zeros((len(label), 6))
        if label.size > 0:
            nlabels = torch.from_numpy(label)
            labels = self.__xywhn2xyxy(nlabels, image_width, image_height)

            # scale bounding box to match new image size
            labels[:, [1,3]] = ((labels[:, [1,3]] * scaling_factor) +\
                                (self.__net_width - (image_width * scaling_factor))/2)
            labels[:, [2,4]] = ((labels[:, [2,4]] * scaling_factor) +\
                                (self.__net_width - (image_height * scaling_factor))/2)

            # convert x1, y1, x2, y2 to center_x, center_y, width, height
            label_copy = labels.clone()
            labels[:, 1] = (label_copy[:, 3] + label_copy[:, 1])/2
            labels[:, 2] = (label_copy[:, 4] + label_copy[:, 2])/2
            labels[:, 3] = (label_copy[:, 3] - label_copy[:, 1])
            labels[:, 4] = (label_copy[:, 4] - label_copy[:, 2])

            # scale labels by new image dimension
            labels[:, 1:5] /= self.__net_width
            bs[:, 1:] = labels[:, :]
        return bs

    def collate_fn(self, batch) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Stack the images of a batch and concatenate their labels.

        The position of each sample inside the batch is written into column 0 of its
        label rows (in place) so that the boxes can be traced back to their image.
        """
        batch = [data for data in batch if data is not None]
        imgs, bboxes = list(zip(*batch))

        imgs = torch.stack(imgs)

        for i, boxes in enumerate(bboxes):
            boxes[:, 0] = i
        bboxes = torch.cat(bboxes, 0)

        return imgs, bboxes
def ap_per_class(tp, conf, pred_cls, target_cls):
    """ Compute per-class precision, recall, average precision and F1.
    Source: https://github.com/rafaelpadilla/Object-Detection-Metrics.
    # Arguments
        tp:  True positives (nparray, nx1 or nx10).
        conf:  Objectness value from 0-1 (nparray).
        pred_cls:  Predicted object classes (nparray).
        target_cls:  True object classes (nparray).
    # Returns
        Precision, recall, AP (one column per column of tp) and F1 for every class
        found in target_cls, followed by those classes as int32. Precision, recall
        and F1 are taken at the confidence that maximises the mean F1.
    """
    # Highest confidence first
    order = np.argsort(-conf)
    tp, conf, pred_cls = tp[order], conf[order], pred_cls[order]

    # Only classes that actually occur in the ground truth are evaluated
    unique_classes = np.unique(target_cls)
    num_classes = unique_classes.shape[0]
    num_thresholds = tp.shape[1]

    # Confidence grid on which the precision / recall curves are sampled
    px = np.linspace(0, 1, 1000)
    ap = np.zeros((num_classes, num_thresholds))
    p = np.zeros((num_classes, 1000))
    r = np.zeros((num_classes, 1000))

    for ci, c in enumerate(unique_classes):
        selected = pred_cls == c
        n_labels = (target_cls == c).sum()
        n_preds = selected.sum()
        if n_preds == 0 or n_labels == 0:
            # nothing to score for this class; its rows stay at zero
            continue

        # Running totals of false and true positives
        fpc = (1 - tp[selected]).cumsum(0)
        tpc = tp[selected].cumsum(0)

        # Recall curve (x values are negated because conf is sorted descending)
        recall = tpc / (n_labels + 1e-16)
        r[ci] = np.interp(-px, -conf[selected], recall[:, 0], left=0)

        # Precision curve
        precision = tpc / (tpc + fpc)
        p[ci] = np.interp(-px, -conf[selected], precision[:, 0], left=1)

        # One AP value per column of tp
        for j in range(num_thresholds):
            ap[ci, j], _, _ = compute_ap(recall[:, j], precision[:, j])

    # F1 is the harmonic mean of precision and recall
    f1 = 2 * p * r / (p + r + 1e-16)
    best = f1.mean(0).argmax()
    return p[:, best], r[:, best], ap, f1[:, best], unique_classes.astype('int32')
def compute_ap(recall, precision):
    """ Compute the average precision, given the recall and precision curves
    # Arguments
        recall:    The recall curve (list)
        precision: The precision curve (list)
    # Returns
        Average precision, precision curve, recall curve
        (both curves include the sentinel values added below)
    """

    # Append sentinel values to beginning and end
    mrec = np.concatenate(([0.0], recall, [1.0]))
    mpre = np.concatenate(([1.0], precision, [0.0]))

    # Compute the precision envelope (make precision non-increasing from left to right)
    mpre = np.flip(np.maximum.accumulate(np.flip(mpre)))

    # Integrate area under curve
    x = np.linspace(0, 1, 101)  # 101-point interp (COCO)
    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid;
    # fall back to it only on older NumPy releases that lack the new name.
    integrate = getattr(np, "trapezoid", None) or np.trapz
    ap = integrate(np.interp(x, mrec, mpre), x)  # integrate

    return ap, mpre, mrec
def xywh2xyxy(box_coord : torch.Tensor):
    """
    Convert bounding box coordinates from center_x, center_y, width, height
    to x_1, y_1, x_2, y_2

    The input is not modified; a converted copy is returned.
    """
    n = box_coord.clone()
    half_wh = box_coord[:, 2:4] / 2
    n[:, 0:2] = box_coord[:, 0:2] - half_wh  # top-left corner
    n[:, 2:4] = box_coord[:, 0:2] + half_wh  # bottom-right corner
    return n


def process_batch(detections, labels, iouv, img_size=416):
    """
    Return correct predictions matrix. Both sets of boxes are in (x1, y1, x2, y2) format.
    Arguments:
        detections (Array[N, 8]), columns 1-4 hold x1, y1, x2, y2 and column 7 the
            predicted class. The boxes are clamped to the image IN PLACE, so pass a
            copy if the original values are still needed.
        labels (Array[M, 5]), class, x1, y1, x2, y2
        iouv (Array[K]), ascending IoU thresholds
        img_size: upper bound the detection boxes are clamped to, in pixels
    Returns:
        correct (Array[N, K]), one boolean per detection and IoU level
    """
    # Clamp the predicted boxes to the image; the labels live in the same pixel space
    detections[:, 1:5] = torch.clamp(detections[:, 1:5], 0.0, img_size)

    correct = torch.zeros(detections.shape[0], iouv.shape[0], dtype=torch.bool, device=iouv.device)
    iou = box_iou(labels[:, 1:], detections[:, 1:5])
    x = torch.where((iou >= iouv[0]) & (labels[:, 0:1] == detections[:, 7]))  # IoU above threshold and classes match
    if x[0].shape[0]:
        matches = torch.cat((torch.stack(x, 1), iou[x[0], x[1]][:, None]), 1).cpu().numpy()  # [label, detection, iou]
        if x[0].shape[0] > 1:
            # Best IoU first, then keep a single match per detection and per label
            matches = matches[matches[:, 2].argsort()[::-1]]
            matches = matches[np.unique(matches[:, 1], return_index=True)[1]]
            matches = matches[np.unique(matches[:, 0], return_index=True)[1]]
        matches = torch.Tensor(matches).to(iouv.device)
        correct[matches[:, 1].long()] = matches[:, 2:3] >= iouv
    return correct


@torch.no_grad()
def run(model, val_dataloader, num_class, net_dim=416, nms_thresh=0.6, objectness_thresh=0.001, device="cpu"):
    """
    Evaluate `model` on `val_dataloader`.

    Arguments:
        model: the detection model; it is switched to eval mode.
        val_dataloader: yields (images, targets) batches, target rows being
            (image index, class, center_x, center_y, width, height) with the box
            normalized to 0-1.
        num_class: number of classes.
        net_dim: network input size in pixels; labels are scaled to it and
            predictions are clamped to it.
        nms_thresh, objectness_thresh: forwarded to get_predictions.
        device: device used for inference and metric computation.
    Returns:
        mean precision, mean recall, mAP@0.5, mAP@0.5:0.95
        (all 0.0 when nothing could be evaluated)
    """
    model.eval()

    nc = int(num_class)  # number of classes
    iouv = torch.linspace(0.5, 0.95, 10).to(device)  # iou vector for mAP@0.5:0.95
    niou = iouv.numel()

    mp, mr, map50, map5095 = 0.0, 0.0, 0.0, 0.0
    stats = []
    # Wrap the dataloader itself so that tqdm can display the total number of batches
    for im, targets in tqdm(val_dataloader):
        im = im.to(device)
        targets = targets.to(device)
        nb = im.shape[0]  # batch

        # Inference
        out = model(im)  # inference

        # NMS
        targets[:, 2:] *= torch.Tensor([net_dim, net_dim, net_dim, net_dim]).to(device)  # to pixels
        out = get_predictions(
                    pred=out.to(device), num_classes=nc,
                    objectness_confidence=objectness_thresh,
                    nms_confidence_level=nms_thresh, device=device
                )

        # Metrics
        for si in range(nb):
            labels = targets[targets[:, 0] == si, 1:]
            # get_predictions may return a non-tensor value; treat that as "no predictions"
            pred = out[out[:, 0]==si, :] if isinstance(out, torch.Tensor) else torch.zeros((0,0), device=device)
            nl = len(labels)
            tcls = labels[:, 0].tolist() if nl else []  # target class

            if len(pred) == 0:
                if nl:
                    # Labels without predictions still count as missed targets
                    stats.append((torch.zeros(0, niou, dtype=torch.bool, device="cpu"), torch.Tensor(device="cpu"), torch.Tensor(device="cpu"), tcls))
                continue

            # Predictions
            if nc==1:
                pred[:, 7] = 0
            if pred.shape[0] > 300:
                pred = pred[:300, :]  # sorted by confidence
            predn = pred.clone()  # process_batch clamps its input in place

            # Evaluate
            if nl:
                tbox = xywh2xyxy(labels[:, 1:5]).to(device)  # target boxes
                labelsn = torch.cat((labels[:, 0:1], tbox), 1).to(device)  # native-space labels
                correct = process_batch(predn, labelsn, iouv, img_size=net_dim)
            else:
                correct = torch.zeros(pred.shape[0], niou, dtype=torch.bool)
            stats.append((correct.cpu(), pred[:, 5].cpu(), pred[:, 7].cpu(), tcls))  # (correct, conf, pcls, tcls)

    # Compute metrics
    stats = [np.concatenate(x, 0) for x in zip(*stats)]  # to numpy
    if len(stats) and stats[0].any():
        p, r, ap, _, _ = ap_per_class(*stats)
        ap50, ap = ap[:, 0], ap.mean(1)  # AP@0.5, AP@0.5:0.95
        mp, mr, map50, map5095 = p.mean(), r.mean(), ap50.mean(), ap.mean()
    return mp, mr, map50, map5095
================================================ FILE: imageai/Detection/README.md ================================================ # ImageAI : Object Detection ## --------------------------------------------------- ## Introducing Jarvis and TheiaEngine. We the creators of ImageAI are glad to announce 2 new AI projects to provide state-of-the-art Generative AI, LLM and Image Understanding on your personal computer and servers. [![](../../jarvis.png)](https://jarvis.genxr.co) Install Jarvis on PC/Mac to setup limitless access to LLM powered AI Chats for your every day work, research and generative AI needs with 100% privacy and full offline capability. Visit [https://jarvis.genxr.co](https://jarvis.genxr.co/) to get started. [![](../../theiaengine.png)](https://www.genxr.co/theia-engine) [TheiaEngine](https://www.genxr.co/theia-engine), the next-generation computer Vision AI API capable of all Generative and Understanding computer vision tasks in a single API call and available via REST API to all programming languages. Features include - **Detect 300+ objects** ( 220 more objects than ImageAI) - **Provide answers to any content or context questions** asked on an image - very useful to get information on any object, action or information without needing to train a new custom model for every tasks - **Generate scene description and summary** - **Convert 2D image to 3D pointcloud and triangular mesh** - **Semantic Scene mapping of objects, walls, floors, etc** - **Stateless Face recognition and emotion detection** - **Image generation and augmentation from prompt** - etc. Visit [https://www.genxr.co/theia-engine](https://www.genxr.co/theia-engine) to try the demo and join in the beta testing today. 
## --------------------------------------------------- ### TABLE OF CONTENTS - :white_square_button: First Object Detection - :white_square_button: Object Detection, Extraction and Fine-tune - :white_square_button: Custom Object Detection - :white_square_button: Detection Speed - :white_square_button: Hiding/Showing Object Name and Probability - :white_square_button: Image Input & Output Types - :white_square_button: Documentation ImageAI provides very convenient and powerful methods to perform object detection on images and extract each object from the image. The object detection class supports RetinaNet, YOLOv3 and TinyYOLOv3. To start performing object detection, you must download the RetinaNet, YOLOv3 or TinyYOLOv3 object detection model via the links below: * **[RetinaNet](https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/retinanet_resnet50_fpn_coco-eeacb38b.pth)** _(Size = 130 mb, high performance and accuracy, with longer detection time)_ * **[YOLOv3](https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/yolov3.pt)** _(Size = 237 mb, moderate performance and accuracy, with a moderate detection time)_ * **[TinyYOLOv3](https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/tiny-yolov3.pt)** _(Size = 34 mb, optimized for speed and moderate performance, with fast detection time)_ Once you download the object detection model file, you should copy the model file to the your project folder where your .py files will be. Then create a python file and give it a name; an example is FirstObjectDetection.py. Then write the code below into the python file: ### FirstObjectDetection.py
```python from imageai.Detection import ObjectDetection import os execution_path = os.getcwd() detector = ObjectDetection() detector.setModelTypeAsYOLOv3() detector.setModelPath( os.path.join(execution_path , "yolov3.pt")) detector.loadModel() detections = detector.detectObjectsFromImage(input_image=os.path.join(execution_path , "image2.jpg"), output_image_path=os.path.join(execution_path , "image2new.jpg"), minimum_percentage_probability=30) for eachObject in detections: print(eachObject["name"] , " : ", eachObject["percentage_probability"], " : ", eachObject["box_points"] ) print("--------------------------------") ``` Sample Result: Input Image ![Input Image](../../data-images/image2.jpg) Output Image ![Output Image](../../data-images/yolo.jpg) ``` laptop : 87.32235431671143 : (306, 238, 390, 284) -------------------------------- laptop : 96.86298966407776 : (121, 209, 258, 293) -------------------------------- laptop : 98.6301600933075 : (279, 321, 401, 425) -------------------------------- laptop : 99.78572130203247 : (451, 204, 579, 285) -------------------------------- bed : 94.02391314506531 : (23, 205, 708, 553) -------------------------------- apple : 48.03136885166168 : (527, 343, 557, 364) -------------------------------- cup : 34.09906327724457 : (462, 347, 496, 379) -------------------------------- cup : 44.65090036392212 : (582, 342, 618, 386) -------------------------------- person : 57.70219564437866 : (27, 311, 341, 437) -------------------------------- person : 85.26121377944946 : (304, 173, 387, 253) -------------------------------- person : 96.33603692054749 : (415, 130, 538, 266) -------------------------------- person : 96.95255160331726 : (174, 108, 278, 269) -------------------------------- ``` Let us make a breakdown of the object detection code that we used above. 
```python from imageai.Detection import ObjectDetection import os execution_path = os.getcwd() ``` In the 3 lines above, we import the **ImageAI object detection** class in the first line, import the `os` module in the second line and obtain the path to the folder where our python file runs in the third line. ```python detector = ObjectDetection() detector.setModelTypeAsYOLOv3() detector.setModelPath( os.path.join(execution_path , "yolov3.pt")) detector.loadModel() ``` In the 4 lines above, we created a new instance of the `ObjectDetection` class in the first line, set the model type to YOLOv3 in the second line, set the model path to the YOLOv3 model file we downloaded and copied to the python file folder in the third line and loaded the model in the fourth line. ```python detections = detector.detectObjectsFromImage(input_image=os.path.join(execution_path , "image2.jpg"), output_image_path=os.path.join(execution_path , "image2new.jpg")) for eachObject in detections: print(eachObject["name"] , " : ", eachObject["percentage_probability"], " : ", eachObject["box_points"] ) print("--------------------------------") ``` In the code above, we ran the `detectObjectsFromImage()` function and passed in the path to our image, and the path to the new image which the function will save. The function then returns an array of dictionaries, with each dictionary corresponding to one object detected in the image. Each dictionary has the properties `name` (name of the object), `percentage_probability` (percentage probability of the detection) and `box_points` (the x1, y1, x2 and y2 coordinates of the bounding box of the object). 
Should you want to use the RetinaNet which is appropriate for high-performance and high-accuracy demanding detection tasks, you will download the RetinaNet model file from the links above, copy it to your python file's folder, set the model type and model path in your python code as seen below: ```python detector = ObjectDetection() detector.setModelTypeAsRetinaNet() detector.setModelPath( os.path.join(execution_path , "retinanet_resnet50_fpn_coco-eeacb38b.pth")) detector.loadModel() ``` However, if you desire TinyYOLOv3 which is optimized for speed and embedded devices, you will download the TinyYOLOv3 model file from the links above, copy it to your python file's folder, set the model type and model path in your python code as seen below: ```python detector = ObjectDetection() detector.setModelTypeAsTinyYOLOv3() detector.setModelPath( os.path.join(execution_path , "tiny-yolov3.pt")) detector.loadModel() ``` ## Object Detection, Extraction and Fine-tune
In the examples we used above, we ran the object detection on an image and it returned the detected objects in an array as well as saved a new image with rectangular markers drawn on each object. In our next examples, we will be able to extract each object from the input image and save it independently. In the example code below, which is nearly identical to the previous object detection code, we will save each object detected as a separate image. ```python from imageai.Detection import ObjectDetection import os execution_path = os.getcwd() detector = ObjectDetection() detector.setModelTypeAsYOLOv3() detector.setModelPath( os.path.join(execution_path , "yolov3.pt")) detector.loadModel() detections, objects_path = detector.detectObjectsFromImage(input_image=os.path.join(execution_path , "image3.jpg"), output_image_path=os.path.join(execution_path , "image3new.jpg"), minimum_percentage_probability=30, extract_detected_objects=True) for eachObject, eachObjectPath in zip(detections, objects_path): print(eachObject["name"] , " : " , eachObject["percentage_probability"], " : ", eachObject["box_points"] ) print("Object's image saved in " + eachObjectPath) print("--------------------------------") ``` ![Input Image](../../data-images/image3.jpg) ![Output Images](../../data-images/image3new.jpg) ![dog](../../data-images/image3new-objects/dog-1.jpg) ![motorcycle](../../data-images/image3new-objects/motorcycle-3.jpg) ![car](../../data-images/image3new-objects/car-4.jpg) ![bicycle](../../data-images/image3new-objects/bicycle-5.jpg) ![person](../../data-images/image3new-objects/person-6.jpg) ![person](../../data-images/image3new-objects/person-7.jpg) ![person](../../data-images/image3new-objects/person-8.jpg) ![person](../../data-images/image3new-objects/person-9.jpg) ![person](../../data-images/image3new-objects/person-10.jpg) Let us review the part of the code that performs the object detection and extracts the images: ```python detections, objects_path = 
detector.detectObjectsFromImage(input_image=os.path.join(execution_path , "image3.jpg"), output_image_path=os.path.join(execution_path , "image3new.jpg"), minimum_percentage_probability=30, extract_detected_objects=True) for eachObject, eachObjectPath in zip(detections, objects_path): print(eachObject["name"] , " : " , eachObject["percentage_probability"], " : ", eachObject["box_points"] ) print("Object's image saved in " + eachObjectPath) print("--------------------------------") ``` In the lines above, we called the `detectObjectsFromImage()` function and passed in the input image path, the output image path, and an extra parameter `extract_detected_objects=True`. This parameter states that the function should extract each object detected from the image and save it as a separate image. The parameter is false by default. Once set to `True`, the function will create a directory which is the **output image path + "-objects"** . Then it saves all the extracted images into this new directory with each image's name being the **detected object name + "-" + a number** which corresponds to the order in which the objects were detected. This new parameter we set to extract and save detected objects as an image will make the function return 2 values. The first is the array of dictionaries with each dictionary corresponding to a detected object. The second is an array of the paths to the saved images of each object detected and extracted, and they are arranged in the same order as the objects in the first array. **And one important feature you need to know!** You will recall that the percentage probability for each detected object is sent back by the `detectObjectsFromImage()` function. The function has a parameter `minimum_percentage_probability`, whose default value is `50` (value ranges between 0 - 100), but it is set to 30 in this example. That means the function will only return a detected object if its percentage probability is **30 or above**. 
The value was kept at this number to ensure the integrity of the detection results. You can fine-tune the object detection by setting **minimum_percentage_probability** to a smaller value to detect more objects, or to a higher value to detect fewer objects. ## Custom Object Detection
The object detection model (**RetinaNet**) supported by **ImageAI** can detect 80 different types of objects. They include: ``` person, bicycle, car, motorcycle, airplane, bus, train, truck, boat, traffic light, fire hydrant, stop_sign, parking meter, bench, bird, cat, dog, horse, sheep, cow, elephant, bear, zebra, giraffe, backpack, umbrella, handbag, tie, suitcase, frisbee, skis, snowboard, sports ball, kite, baseball bat, baseball glove, skateboard, surfboard, tennis racket, bottle, wine glass, cup, fork, knife, spoon, bowl, banana, apple, sandwich, orange, broccoli, carrot, hot dog, pizza, donot, cake, chair, couch, potted plant, bed, dining table, toilet, tv, laptop, mouse, remote, keyboard, cell phone, microwave, oven, toaster, sink, refrigerator, book, clock, vase, scissors, teddy bear, hair dryer, toothbrush. ``` Interestingly, **ImageAI** allow you to perform detection for one or more of the items above. That means you can customize the type of object(s) you want to be detected in the image. Let's take a look at the code below: ```python from imageai.Detection import ObjectDetection import os execution_path = os.getcwd() detector = ObjectDetection() detector.setModelTypeAsYOLOv3() detector.setModelPath( os.path.join(execution_path , "yolov3.pt")) detector.loadModel() custom_objects = detector.CustomObjects(car=True, motorcycle=True) detections = detector.detectCustomObjectsFromImage(custom_objects=custom_objects, input_image=os.path.join(execution_path , "image3.jpg"), output_image_path=os.path.join(execution_path , "image3custom.jpg"), minimum_percentage_probability=30) for eachObject in detections: print(eachObject["name"] , " : ", eachObject["percentage_probability"], " : ", eachObject["box_points"] ) print("--------------------------------") ``` ![Result](../../data-images/image3custom.jpg) Let us take a look at the part of the code that made this possible. 
```python custom_objects = detector.CustomObjects(car=True, motorcycle=True) detections = detector.detectCustomObjectsFromImage(custom_objects=custom_objects, input_image=os.path.join(execution_path , "image3.jpg"), output_image_path=os.path.join(execution_path , "image3custom.jpg"), minimum_percentage_probability=30) ``` In the above code, after loading the model (can be done before loading the model as well), we defined a new variable `custom_objects = detector.CustomObjects()`, in which we set its car and motorcycle properties equal to **True**. This is to tell the model to detect only the object we set to True. Then we call the `detector.detectCustomObjectsFromImage()` which is the function that allows us to perform detection of custom objects. Then we will set the `custom_objects` value to the custom objects variable we defined. ## Hiding/Showing Object Name and Probability
**ImageAI** provides options to hide the name of objects detected and/or the percentage probability from being shown on the saved/returned detected image. Using the `detectObjectsFromImage()` and `detectCustomObjectsFromImage()` functions, the parameters `display_object_name` and `display_percentage_probability` can be set to True or False individually. Take a look at the code below: ```python detections = detector.detectObjectsFromImage(input_image=os.path.join(execution_path , "image3.jpg"), output_image_path=os.path.join(execution_path , "image3new_nodetails.jpg"), minimum_percentage_probability=30, display_percentage_probability=False, display_object_name=False) ``` In the above code, we specified that both the object name and percentage probability should not be shown. As you can see in the result below, neither the names of the objects nor their individual percentage probabilities are shown in the detected image. ![Result](../../data-images/nodetails.jpg) ## Image Input & Output Types
**ImageAI** supports 3 types of inputs which are **file path to image file** (default), **numpy array of image** and **image file stream** as well as 2 types of output which are image **file** (default) and numpy **array**. This means you can now perform object detection in production applications such as on a web server and systems that return files in any of the above stated formats. To perform object detection with numpy array or file stream input, you just need to state the input type in the `.detectObjectsFromImage()` function or the `.detectCustomObjectsFromImage()` function. See example below. ```python detections = detector.detectObjectsFromImage(input_type="array", input_image=image_array , output_image_path=os.path.join(execution_path , "image.jpg")) # For numpy array input type detections = detector.detectObjectsFromImage(input_type="stream", input_image=image_stream , output_image_path=os.path.join(execution_path , "test2new.jpg")) # For file stream input type ``` To perform object detection with numpy array output you just need to state the output type in the `.detectObjectsFromImage()` function or the `.detectCustomObjectsFromImage()` function. See example below. ```python detected_image_array, detections = detector.detectObjectsFromImage(output_type="array", input_image="image.jpg" ) # For numpy array output type ``` ## Documentation
We have provided full documentation for all **ImageAI** classes and functions. Find links below: * Documentation - **English Version [https://imageai.readthedocs.io](https://imageai.readthedocs.io)** ================================================ FILE: imageai/Detection/VIDEO.md ================================================ # ImageAI : Video Object Detection, Tracking and Analysis ## --------------------------------------------------- ## Introducing Jarvis and TheiaEngine. We the creators of ImageAI are glad to announce 2 new AI projects to provide state-of-the-art Generative AI, LLM and Image Understanding on your personal computer and servers. [![](../../jarvis.png)](https://jarvis.genxr.co) Install Jarvis on PC/Mac to setup limitless access to LLM powered AI Chats for your every day work, research and generative AI needs with 100% privacy and full offline capability. Visit [https://jarvis.genxr.co](https://jarvis.genxr.co/) to get started. [![](../../theiaengine.png)](https://www.genxr.co/theia-engine) [TheiaEngine](https://www.genxr.co/theia-engine), the next-generation computer Vision AI API capable of all Generative and Understanding computer vision tasks in a single API call and available via REST API to all programming languages. Features include - **Detect 300+ objects** ( 220 more objects than ImageAI) - **Provide answers to any content or context questions** asked on an image - very useful to get information on any object, action or information without needing to train a new custom model for every tasks - **Generate scene description and summary** - **Convert 2D image to 3D pointcloud and triangular mesh** - **Semantic Scene mapping of objects, walls, floors, etc** - **Stateless Face recognition and emotion detection** - **Image generation and augmentation from prompt** - etc. Visit [https://www.genxr.co/theia-engine](https://www.genxr.co/theia-engine) to try the demo and join in the beta testing today. 
## --------------------------------------------------- ## TABLE OF CONTENTS - :white_square_button: First Video Object Detection - :white_square_button: Custom Video Object Detection (Object Tracking) - :white_square_button: Camera / Live Stream Video Detection - :white_square_button: Video Analysis - :white_square_button: Detection Speed - :white_square_button: Hiding/Showing Object Name and Probability - :white_square_button: Frame Detection Intervals - :white_square_button: Video Detection Timeout (NEW) - :white_square_button: Documentation ImageAI provides convenient, flexible and powerful methods to perform object detection on videos. The video object detection class provided only supports RetinaNet, YOLOv3 and TinyYOLOv3. This version of **ImageAI** provides commercial grade video object detection features, which include but are not limited to device/IP camera inputs, per frame, per second, per minute and entire video analysis for storing in databases and/or real-time visualizations and for future insights. To start performing video object detection, you must download the RetinaNet, YOLOv3 or TinyYOLOv3 object detection model via the links below: * **[RetinaNet](https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/retinanet_resnet50_fpn_coco-eeacb38b.pth)** _(Size = 130 mb, high performance and accuracy, with longer detection time)_ * **[YOLOv3](https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/yolov3.pt)** _(Size = 237 mb, moderate performance and accuracy, with a moderate detection time)_ * **[TinyYOLOv3](https://github.com/OlafenwaMoses/ImageAI/releases/download/3.0.0-pretrained/tiny-yolov3.pt)** _(Size = 34 mb, optimized for speed and moderate performance, with fast detection time)_ Because video object detection is a compute-intensive task, we advise you to perform this experiment using a computer with an NVIDIA GPU and the GPU version of PyTorch installed.
Performing Video Object Detection on a CPU will be slower than using an NVIDIA GPU powered computer. You can use Google Colab for this experiment as it has an NVIDIA K80 GPU available for free. Once you download the object detection model file, you should copy the model file to your project folder where your .py files will be. Then create a python file and give it a name; an example is `FirstVideoObjectDetection.py`. Then write the code below into the python file: ### FirstVideoObjectDetection.py
```python from imageai.Detection import VideoObjectDetection import os execution_path = os.getcwd() detector = VideoObjectDetection() detector.setModelTypeAsRetinaNet() detector.setModelPath( os.path.join(execution_path , "retinanet_resnet50_fpn_coco-eeacb38b.pth")) detector.loadModel() video_path = detector.detectObjectsFromVideo(input_file_path=os.path.join(execution_path, "traffic.mp4"), output_file_path=os.path.join(execution_path, "traffic_detected") , frames_per_second=20, log_progress=True) print(video_path) ``` Input Video (a 1 min 24 seconds video) [![](../../data-images/video--1.jpg)](https://github.com/OlafenwaMoses/ImageAI/blob/master/data-videos/traffic.mp4) Output Video [![](../../data-images/video-2.jpg)](https://www.youtube.com/embed/qplVDqOmElI?rel=0) Let us make a breakdown of the object detection code that we used above. ```python from imageai.Detection import VideoObjectDetection import os execution_path = os.getcwd() ``` In the 3 lines above, we imported the **ImageAI video object detection** class in the first line, imported the **os** module in the second line and obtained the path to the folder where our python file runs in the third line. ```python detector = VideoObjectDetection() detector.setModelTypeAsRetinaNet() detector.setModelPath( os.path.join(execution_path , "retinanet_resnet50_fpn_coco-eeacb38b.pth")) detector.loadModel() ``` In the 4 lines above, we created a new instance of the **VideoObjectDetection** class in the first line, set the model type to RetinaNet in the second line, set the model path to the RetinaNet model file we downloaded and copied to the python file folder in the third line and loaded the model in the fourth line.
```python video_path = detector.detectObjectsFromVideo(input_file_path=os.path.join(execution_path, "traffic.mp4"), output_file_path=os.path.join(execution_path, "traffic_detected"), frames_per_second=20, log_progress=True) print(video_path) ``` In the 2 lines above, we ran the `detectObjectsFromVideo()` function and passed in the path to our video, the path to the new video (without the extension, it saves a .avi video by default) which the function will save, the number of frames per second (fps) that we desire the output video to have and the option to log the progress of the detection in the console. Then the function returns the path to the saved video which contains boxes and percentage probabilities rendered on objects detected in the video. ### Custom Video Object Detection
The video object detection model (**RetinaNet**) supported by **ImageAI** can detect 80 different types of objects. They include: ``` person, bicycle, car, motorcycle, airplane, bus, train, truck, boat, traffic light, fire hydrant, stop_sign, parking meter, bench, bird, cat, dog, horse, sheep, cow, elephant, bear, zebra, giraffe, backpack, umbrella, handbag, tie, suitcase, frisbee, skis, snowboard, sports ball, kite, baseball bat, baseball glove, skateboard, surfboard, tennis racket, bottle, wine glass, cup, fork, knife, spoon, bowl, banana, apple, sandwich, orange, broccoli, carrot, hot dog, pizza, donot, cake, chair, couch, potted plant, bed, dining table, toilet, tv, laptop, mouse, remote, keyboard, cell phone, microwave, oven, toaster, sink, refrigerator, book, clock, vase, scissors, teddy bear, hair dryer, toothbrush. ``` Interestingly, **ImageAI** allow you to perform detection for one or more of the items above. That means you can customize the type of object(s) you want to be detected in the video. Let's take a look at the code below: ```python from imageai.Detection import VideoObjectDetection import os execution_path = os.getcwd() detector = VideoObjectDetection() detector.setModelTypeAsRetinaNet() detector.setModelPath( os.path.join(execution_path , "retinanet_resnet50_fpn_coco-eeacb38b.pth")) detector.loadModel() custom_objects = detector.CustomObjects(person=True, bicycle=True, motorcycle=True) video_path = detector.detectCustomObjectsFromVideo( custom_objects=custom_objects, input_file_path=os.path.join(execution_path, "traffic.mp4"), output_file_path=os.path.join(execution_path, "traffic_custom_detected"), frames_per_second=20, log_progress=True) print(video_path) ``` Let us take a look at the part of the code that made this possible. 
```python custom_objects = detector.CustomObjects(person=True, bicycle=True, motorcycle=True) video_path = detector.detectCustomObjectsFromVideo( custom_objects=custom_objects, input_file_path=os.path.join(execution_path, "traffic.mp4"), output_file_path=os.path.join(execution_path, "traffic_custom_detected"), frames_per_second=20, log_progress=True) ``` In the above code, after loading the model (can be done before loading the model as well), we defined a new variable `custom_objects = detector.CustomObjects()`, in which we set its person, bicycle and motorcycle properties equal to **True**. This is to tell the model to detect only the objects we set to True. Then we call the `detector.detectCustomObjectsFromVideo()` which is the function that allows us to perform detection of custom objects. Then we will set the `custom_objects` value to the custom objects variable we defined. Output Video [![Output Video](../../data-images/video-3.jpg)](https://www.youtube.com/embed/YfAycAzkwPM?rel=0) C:\Users\User\PycharmProjects\ImageAITest\traffic_custom_detected.avi ### Camera / Live Stream Video Detection
**ImageAI** now allows live-video detection with support for camera inputs. Using **OpenCV**'s `VideoCapture()` function, you can load live-video streams from a device camera, cameras connected by cable or IP cameras, and pass it into **ImageAI**'s `detectObjectsFromVideo()` and `detectCustomObjectsFromVideo()` functions. All features that are supported for detecting objects in a video file are also available for detecting objects in a camera's live-video feed. Find below an example of detecting live-video feed from the device camera. ```python from imageai.Detection import VideoObjectDetection import os import cv2 execution_path = os.getcwd() camera = cv2.VideoCapture(0) detector = VideoObjectDetection() detector.setModelTypeAsRetinaNet() detector.setModelPath(os.path.join(execution_path , "retinanet_resnet50_fpn_coco-eeacb38b.pth")) detector.loadModel() video_path = detector.detectObjectsFromVideo( camera_input=camera, output_file_path=os.path.join(execution_path, "camera_detected_video"), frames_per_second=20, log_progress=True, minimum_percentage_probability=40) ``` The difference between the code above and the code for the detection of a video file is that we defined an **OpenCV VideoCapture** instance and loaded the default device camera into it. Then we passed the camera we defined into the parameter `camera_input` which replaces the `input_file_path` that is used for video file. ### Video Analysis
**ImageAI** now provides commercial-grade video analysis in the Video Object Detection class, for both video file inputs and camera inputs. This feature allows developers to obtain deep insights into any video processed with **ImageAI**. These insights can be visualized in real-time, or stored in a NoSQL database for future review or analysis. For video analysis, the `detectObjectsFromVideo()` and `detectCustomObjectsFromVideo()` functions now allow you to state your own defined functions which will be executed for every frame, second and/or minute of the video detected, as well as a function that will be executed at the end of a video detection. Once these functions are stated, they will receive raw but comprehensive analytical data on the index of the frame/second/minute, objects detected (name, percentage_probability and box_points), number of instances of each unique object detected and average number of occurrences of each unique object detected over a second/minute and entire video. To obtain the video analysis, all you need to do is specify a function, state the corresponding parameters it will be receiving and pass the function name into the `per_frame_function`, `per_second_function`, `per_minute_function` and `video_complete_function` parameters in the detection function. Find below examples of video analysis functions.
```python def forFrame(frame_number, output_array, output_count): print("FOR FRAME " , frame_number) print("Output for each object : ", output_array) print("Output count for unique objects : ", output_count) print("------------END OF A FRAME --------------") def forSeconds(second_number, output_arrays, count_arrays, average_output_count): print("SECOND : ", second_number) print("Array for the outputs of each frame ", output_arrays) print("Array for output count for unique objects in each frame : ", count_arrays) print("Output average count for unique objects in the last second: ", average_output_count) print("------------END OF A SECOND --------------") def forMinute(minute_number, output_arrays, count_arrays, average_output_count): print("MINUTE : ", minute_number) print("Array for the outputs of each frame ", output_arrays) print("Array for output count for unique objects in each frame : ", count_arrays) print("Output average count for unique objects in the last minute: ", average_output_count) print("------------END OF A MINUTE --------------") video_detector = VideoObjectDetection() video_detector.setModelTypeAsYOLOv3() video_detector.setModelPath(os.path.join(execution_path, "yolov3.pt")) video_detector.loadModel() video_detector.detectObjectsFromVideo( input_file_path=os.path.join(execution_path, "traffic.mp4"), output_file_path=os.path.join(execution_path, "traffic_detected"), frames_per_second=10, per_second_function=forSeconds, per_frame_function=forFrame, per_minute_function=forMinute, minimum_percentage_probability=30 ) ``` When the detection starts on a video feed, be it from a video file or camera input, the result will have the format as below: **Results for the Frame function** ``` FOR FRAME : 1 Output for each object : [{'box_points': (362, 295, 443, 355), 'name': 'boat', 'percentage_probability': 26.666194200515747}, {'box_points': (319, 245, 386, 296), 'name': 'boat', 'percentage_probability': 30.052968859672546}, {'box_points': (219, 308, 341, 
358), 'name': 'boat', 'percentage_probability': 47.46982455253601}, {'box_points': (589, 198, 621, 241), 'name': 'bus', 'percentage_probability': 24.62330162525177}, {'box_points': (519, 181, 583, 263), 'name': 'bus', 'percentage_probability': 27.446213364601135}, {'box_points': (493, 197, 561, 272), 'name': 'bus', 'percentage_probability': 59.81815457344055}, {'box_points': (432, 187, 491, 240), 'name': 'bus', 'percentage_probability': 64.42965269088745}, {'box_points': (157, 225, 220, 255), 'name': 'car', 'percentage_probability': 21.150341629981995}, {'box_points': (324, 249, 377, 293), 'name': 'car', 'percentage_probability': 24.089913070201874}, {'box_points': (152, 275, 260, 327), 'name': 'car', 'percentage_probability': 30.341443419456482}, {'box_points': (433, 198, 485, 244), 'name': 'car', 'percentage_probability': 37.205660343170166}, {'box_points': (184, 226, 233, 260), 'name': 'car', 'percentage_probability': 38.52525353431702}, {'box_points': (3, 296, 134, 359), 'name': 'car', 'percentage_probability': 47.80363142490387}, {'box_points': (357, 302, 439, 359), 'name': 'car', 'percentage_probability': 47.94844686985016}, {'box_points': (481, 266, 546, 314), 'name': 'car', 'percentage_probability': 65.8585786819458}, {'box_points': (597, 269, 624, 318), 'name': 'person', 'percentage_probability': 27.125394344329834}] Output count for unique objects : {'bus': 4, 'boat': 3, 'person': 1, 'car': 8} ------------END OF A FRAME -------------- ``` For any function you parse into the **per_frame_function**, the function will be executed after every single video frame is processed and he following will be parsed into it: * **Frame Index:** This is the position number of the frame inside the video (e.g 1 for first frame and 20 for twentieth frame). * **Output Array:** This is an array of dictionaries. 
Each dictionary corresponds to each detected object in the image and it contains the "name", "percentage_probabaility" and "box_points"(x1,y1,x2,y2) values of the object. * **Output Count:** This is a dictionary that has the name of each unique object detected as its keys and the number of instances of the objects detected as the values. **Results for the Second function** ``` FOR SECOND : 1 Array for the outputs of each frame [[{'box_points': (362, 295, 443, 355), 'name': 'boat', 'percentage_probability': 26.666194200515747}, {'box_points': (319, 245, 386, 296), 'name': 'boat', 'percentage_probability': 30.052968859672546}, {'box_points': (219, 308, 341, 358), 'name': 'boat', 'percentage_probability': 47.46982455253601}, {'box_points': (589, 198, 621, 241), 'name': 'bus', 'percentage_probability': 24.62330162525177}, {'box_points': (519, 181, 583, 263), 'name': 'bus', 'percentage_probability': 27.446213364601135}, {'box_points': (493, 197, 561, 272), 'name': 'bus', 'percentage_probability': 59.81815457344055}, {'box_points': (432, 187, 491, 240), 'name': 'bus', 'percentage_probability': 64.42965269088745}, {'box_points': (157, 225, 220, 255), 'name': 'car', 'percentage_probability': 21.150341629981995}, {'box_points': (324, 249, 377, 293), 'name': 'car', 'percentage_probability': 24.089913070201874}, {'box_points': (152, 275, 260, 327), 'name': 'car', 'percentage_probability': 30.341443419456482}, {'box_points': (433, 198, 485, 244), 'name': 'car', 'percentage_probability': 37.205660343170166}, {'box_points': (184, 226, 233, 260), 'name': 'car', 'percentage_probability': 38.52525353431702}, {'box_points': (3, 296, 134, 359), 'name': 'car', 'percentage_probability': 47.80363142490387}, {'box_points': (357, 302, 439, 359), 'name': 'car', 'percentage_probability': 47.94844686985016}, {'box_points': (481, 266, 546, 314), 'name': 'car', 'percentage_probability': 65.8585786819458}, {'box_points': (597, 269, 624, 318), 'name': 'person', 'percentage_probability': 
27.125394344329834}], [{'box_points': (316, 240, 384, 302), 'name': 'boat', 'percentage_probability': 29.594269394874573}, {'box_points': (361, 295, 441, 354), 'name': 'boat', 'percentage_probability': 36.11513376235962}, {'box_points': (216, 305, 340, 357), 'name': 'boat', 'percentage_probability': 44.89373862743378}, {'box_points': (432, 198, 488, 244), 'name': 'truck', 'percentage_probability': 22.914741933345795}, {'box_points': (589, 199, 623, 240), 'name': 'bus', 'percentage_probability': 20.545457303524017}, {'box_points': (519, 182, 583, 263), 'name': 'bus', 'percentage_probability': 24.467085301876068}, {'box_points': (492, 197, 563, 271), 'name': 'bus', 'percentage_probability': 61.112016439437866}, {'box_points': (433, 188, 490, 241), 'name': 'bus', 'percentage_probability': 65.08989334106445}, {'box_points': (352, 303, 442, 357), 'name': 'car', 'percentage_probability': 20.025095343589783}, {'box_points': (136, 172, 188, 195), 'name': 'car', 'percentage_probability': 21.571354568004608}, {'box_points': (152, 276, 261, 326), 'name': 'car', 'percentage_probability': 33.07966589927673}, {'box_points': (181, 225, 230, 256), 'name': 'car', 'percentage_probability': 35.111838579177856}, {'box_points': (432, 198, 488, 244), 'name': 'car', 'percentage_probability': 36.25282347202301}, {'box_points': (3, 292, 130, 360), 'name': 'car', 'percentage_probability': 67.55480170249939}, {'box_points': (479, 265, 546, 314), 'name': 'car', 'percentage_probability': 71.47912979125977}, {'box_points': (597, 269, 625, 318), 'name': 'person', 'percentage_probability': 25.903674960136414}],................, [{'box_points': (133, 250, 187, 278), 'name': 'umbrella', 'percentage_probability': 21.518094837665558}, {'box_points': (154, 233, 218, 259), 'name': 'umbrella', 'percentage_probability': 23.687003552913666}, {'box_points': (348, 311, 425, 360), 'name': 'boat', 'percentage_probability': 21.015766263008118}, {'box_points': (11, 164, 137, 225), 'name': 'bus', 
'percentage_probability': 32.20453858375549}, {'box_points': (424, 187, 485, 243), 'name': 'bus', 'percentage_probability': 38.043853640556335}, {'box_points': (496, 186, 570, 264), 'name': 'bus', 'percentage_probability': 63.83994221687317}, {'box_points': (588, 197, 622, 240), 'name': 'car', 'percentage_probability': 23.51653128862381}, {'box_points': (58, 268, 111, 303), 'name': 'car', 'percentage_probability': 24.538707733154297}, {'box_points': (2, 246, 72, 301), 'name': 'car', 'percentage_probability': 28.433072566986084}, {'box_points': (472, 273, 539, 323), 'name': 'car', 'percentage_probability': 87.17672824859619}, {'box_points': (597, 270, 626, 317), 'name': 'person', 'percentage_probability': 27.459821105003357}] ] Array for output count for unique objects in each frame : [{'bus': 4, 'boat': 3, 'person': 1, 'car': 8}, {'truck': 1, 'bus': 4, 'boat': 3, 'person': 1, 'car': 7}, {'bus': 5, 'boat': 2, 'person': 1, 'car': 5}, {'bus': 5, 'boat': 1, 'person': 1, 'car': 9}, {'truck': 1, 'bus': 2, 'car': 6, 'person': 1}, {'truck': 2, 'bus': 4, 'boat': 2, 'person': 1, 'car': 7}, {'truck': 1, 'bus': 3, 'car': 7, 'person': 1, 'umbrella': 1}, {'bus': 4, 'car': 7, 'person': 1, 'umbrella': 2}, {'bus': 3, 'car': 6, 'boat': 1, 'person': 1, 'umbrella': 3}, {'bus': 3, 'car': 4, 'boat': 1, 'person': 1, 'umbrella': 2}] Output average count for unique objects in the last second: {'truck': 0.5, 'bus': 3.7, 'umbrella': 0.8, 'boat': 1.3, 'person': 1.0, 'car': 6.6} ------------END OF A SECOND -------------- ``` In the above result, the video was processed and saved in 10 frames per second (FPS). For any function you parse into the **per_second_function**, the function will be executed after every single second of the video that is processed and he following will be parsed into it: - **Second Index:** This is the position number of the second inside the video (e.g 1 for first second and 20 for twentieth second). 
- **Output Array:** This is an array of arrays, with each contained array and its position (array index + 1) corresponding to the equivalent frame in the last second of the video (In the above example, there are 10 arrays which correspond to the 10 frames contained in one second). Each contained array contains dictionaries. Each dictionary corresponds to each detected object in the image and it contains the "name", "percentage_probability" and "box_points"(x1,y1,x2,y2) values of the object. - **Count arrays:** This is an array of dictionaries. Each dictionary and its position (array index + 1) corresponds to the equivalent frame in the last second of the video. Each dictionary has the name of each unique object detected as its keys and the number of instances of the objects detected as the values. - **Average Output Count:** This is a dictionary that has the name of each unique object detected in the last second as its keys and the average number of instances of the objects detected across the number of frames as the values. **Results for the Minute function** The above set of **4 parameters** that are returned for every second of the video processed are the same parameters that will be returned for every minute of the video processed. The difference is that the index returned corresponds to the minute index, the **output_arrays** is an array that contains FPS * 60 arrays (in the code example above, 10 frames per second(fps) * 60 seconds = 600 frames = 600 arrays), and the **count_arrays** is an array that contains FPS * 60 dictionaries (in the code example above, 10 frames per second(fps) * 60 seconds = 600 frames = 600 dictionaries) and the **average_output_count** is a dictionary that covers all the objects detected in all the frames contained in the last minute. **Results for the Video Complete Function** **ImageAI** allows you to obtain complete analysis of the entire video processed.
All you need is to define a function like the forSeconds or forMinute function and pass it as the **video_complete_function** parameter of your `.detectObjectsFromVideo()` or `.detectCustomObjectsFromVideo()` function. The same values for the per_second_function and per_minute_function will be returned. The difference is that no index will be returned; only the other 3 values will be returned, and the 3 values will cover all frames in the video. Below is a sample function: ```python def forFull(output_arrays, count_arrays, average_output_count): #Perform action on the 3 parameters returned into the function video_detector.detectObjectsFromVideo( input_file_path=os.path.join(execution_path, "traffic.mp4"), output_file_path=os.path.join(execution_path, "traffic_detected"), frames_per_second=10, video_complete_function=forFull, minimum_percentage_probability=30 ) ``` **FINAL NOTE ON VIDEO ANALYSIS** : **ImageAI** allows you to obtain the detected video frame as a Numpy array at each frame, second and minute function. All you need to do is specify one more parameter in your function and set `return_detected_frame=True` in your `detectObjectsFromVideo()` or `detectCustomObjectsFromVideo()` function. Once this is set, the extra parameter you specified in your function will be the Numpy array of the detected frame. See a sample below: ```python def forFrame(frame_number, output_array, output_count, detected_frame): print("FOR FRAME " , frame_number) print("Output for each object : ", output_array) print("Output count for unique objects : ", output_count) print("Returned Objects is : ", type(detected_frame)) print("------------END OF A FRAME --------------") video_detector.detectObjectsFromVideo( input_file_path=os.path.join(execution_path, "traffic.mp4"), output_file_path=os.path.join(execution_path, "traffic_detected"), frames_per_second=10, per_frame_function=forFrame, minimum_percentage_probability=30, return_detected_frame=True ) ``` ### Frame Detection Intervals
The above video object detection tasks are optimized for frame-real-time object detection, which ensures that objects in every frame of the video are detected. **ImageAI** provides you the option to adjust the video frame detections which can speed up your video detection process. When calling the `.detectObjectsFromVideo()` or `.detectCustomObjectsFromVideo()`, you can specify at which frame interval detections should be made. By setting the **frame_detection_interval** parameter to be equal to 5 or 20, that means the object detections in the video will be updated after 5 frames or 20 frames. If your output video **frames_per_second** is set to 20, that means the object detections in the video will be updated once in every quarter of a second or every second. This is useful in scenarios where the available compute is less powerful and speeds of moving objects are low. This ensures you can have objects detected as second-real-time, half-a-second-real-time or whichever way suits your needs. We conducted video object detection on the same input video we have been using all this while by applying a **frame_detection_interval** value equal to 5. ### Video Detection Timeout
**ImageAI** now allows you to set a timeout, in seconds, for the detection of objects in videos or a live camera feed. To set a timeout for your video detection code, all you need to do is set the `detection_timeout` parameter of the `detectObjectsFromVideo()` function to the desired number of seconds. In the example code below, we set `detection_timeout` to 120 seconds (2 minutes).

```python
from imageai.Detection import VideoObjectDetection
import os
import cv2

execution_path = os.getcwd()

camera = cv2.VideoCapture(0)

detector = VideoObjectDetection()
detector.setModelTypeAsRetinaNet()
detector.setModelPath(os.path.join(execution_path , "retinanet_resnet50_fpn_coco-eeacb38b.pth"))
detector.loadModel()

video_path = detector.detectObjectsFromVideo(camera_input=camera,
                                             output_file_path=os.path.join(execution_path, "camera_detected_video"),
                                             frames_per_second=20,
                                             log_progress=True,
                                             minimum_percentage_probability=40,
                                             detection_timeout=120)
```

### Documentation
We have provided full documentation for all **ImageAI** classes and functions. Find links below: - Documentation - **English Version [https://imageai.readthedocs.io](https://imageai.readthedocs.io)** ================================================ FILE: imageai/Detection/__init__.py ================================================ import os, warnings from tkinter import Image from collections import defaultdict from typing import List, Tuple, Dict, Union from PIL import Image import torchvision import numpy as np from enum import Enum import torch import cv2 from typing import Union, List from ..yolov3.yolov3 import YoloV3 from ..yolov3.tiny_yolov3 import YoloV3Tiny from ..yolov3.utils import draw_bbox_and_label, get_predictions, prepare_image from ..retinanet.utils import read_image, draw_bounding_boxes_and_labels, tensor_to_ndarray import uuid from ..backend_check.model_extension import extension_check warnings.filterwarnings("once", category=ResourceWarning) class ImageReadMode(Enum): """ Support for various modes while reading images. Use ``ImageReadMode.UNCHANGED`` for loading the image as-is, ``ImageReadMode.GRAY`` for converting to grayscale, ``ImageReadMode.GRAY_ALPHA`` for grayscale with transparency, ``ImageReadMode.RGB`` for RGB and ``ImageReadMode.RGB_ALPHA`` for RGB with transparency. """ UNCHANGED = 0 GRAY = 1 GRAY_ALPHA = 2 RGB = 3 RGB_ALPHA = 4 class ObjectDetection: """ This is the object detection class for images in the ImageAI library. It allows you to detect the 80 objects in the COCO dataset [ https://cocodataset.org/#home ] in any image. This class provides support for RetinaNet, YOLOv3 and TinyYOLOv3 object detection networks . After instantiating this class, you can set its properties and make object detections using pretrained models. The following functions are required to be called before object detection can be made * setModelPath: Used to specify the filepath to the pretrained model. 
* At least of of the following and it must correspond to the model set in the setModelPath() [setModelTypeAsRetinaNet(), setModelTypeAsYOLOv3(), setModelTypeAsTinyYOLOv3()] * loadModel: [This must be called once only before performing object detection] Once the above functions have been called, you can call the detectObjectsFromImage() function of the object detection instance object at anytime to obtain observable objects in any image. * detectObjectsFromImage: Used to perform object detection on an image """ def __init__(self) -> None: self.__device: str = "cuda" if torch.cuda.is_available() else "cpu" self.__nms_score: float = 0.4 self.__objectness_score: float = 0.5 self.__anchors: List[int] = None self.__anchors_yolov3: List[int] = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326] self.__anchors_tiny_yolov3: List[int] = [10, 14, 23, 27, 37, 58, 81, 82, 135, 169, 344, 319] self.__classes = self.__load_classes(os.path.join(os.path.dirname(os.path.abspath(__file__)), "coco_classes.txt")) self.__model_type = "" self.__model = None self.__model_loaded = False self.__model_path = "" def __load_classes(self, path: str) -> List[str]: with open(path) as f: unique_classes = [c.strip() for c in f.readlines()] return unique_classes def __load_image_yolo(self, input_image : Union[str, np.ndarray, Image.Image]) -> Tuple[List[str], List[np.ndarray], torch.Tensor, torch.Tensor]: allowed_exts = ["jpg", "jpeg", "png"] fnames = [] original_dims = [] inputs = [] original_imgs = [] if type(input_image) == str: if os.path.isfile(input_image): if input_image.rsplit('.')[-1].lower() in allowed_exts: img = cv2.imread(input_image) else: raise ValueError(f"image path '{input_image}' is not found or a valid file") elif type(input_image) == np.ndarray: img = input_image elif "PIL" in str(type(input_image)): img = np.asarray(input_image) else: raise ValueError(f"Invalid image input format") img_h, img_w, _ = img.shape 
original_imgs.append(np.array(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)).astype(np.uint8)) original_dims.append((img_w, img_h)) if type(input_image) == str: fnames.append(os.path.basename(input_image)) else: fnames.append("") inputs.append(prepare_image(img, (416, 416))) if original_dims: return ( fnames, original_imgs, torch.FloatTensor(original_dims).repeat(1,2).to(self.__device), torch.cat(inputs, 0).to(self.__device) ) raise RuntimeError( f"Error loading image." "\nEnsure the file is a valid image," " allowed file extensions are .jpg, .jpeg, .png" ) def __save_temp_img(self, input_image : Union[np.ndarray, Image.Image]) -> str: temp_path = os.path.join( os.path.dirname(os.path.abspath(__file__)), f"{str(uuid.uuid4())}.jpg" ) if type(input_image) == np.ndarray: cv2.imwrite(temp_path, input_image) elif "PIL" in str(type(input_image)): input_image.save(temp_path) else: raise ValueError( f"Invalid image input. Supported formats are OpenCV/Numpy array, PIL image or image file path" ) return temp_path def __load_image_retinanet(self, input_image : str) -> Tuple[List[str], List[torch.Tensor], List[torch.Tensor]]: """ Loads image from the given path. """ allowed_file_extensions = ["jpg", "jpeg", "png"] images = [] scaled_images = [] fnames = [] delete_file = False if type(input_image) is not str: input_image = self.__save_temp_img(input_image=input_image) delete_file = True if os.path.isfile(input_image): if input_image.rsplit('.')[-1].lower() in allowed_file_extensions: img = read_image(input_image, ImageReadMode.RGB) images.append(img) scaled_images.append(img.div(255.0).to(self.__device)) fnames.append(os.path.basename(input_image)) else: raise ValueError(f"Input image with path {input_image} not a valid file") if delete_file: os.remove(input_image) if images: return (fnames, images, scaled_images) raise RuntimeError( f"Error loading image from input." 
"\nEnsure the folder contains images," " allowed file extensions are .jpg, .jpeg, .png" ) def setModelTypeAsYOLOv3(self): """ 'setModelTypeAsYOLOv3()' is used to set the model type to the YOLOv3 model. :return: """ self.__anchors = self.__anchors_yolov3 self.__model_type = "yolov3" def setModelTypeAsTinyYOLOv3(self): """ 'setModelTypeAsTinyYOLOv3()' is used to set the model type to the TinyYOLOv3 model. :return: """ self.__anchors = self.__anchors_tiny_yolov3 self.__model_type = "tiny-yolov3" def setModelTypeAsRetinaNet(self): """ 'setModelTypeAsRetinaNet()' is used to set the model type to the RetinaNet model. :return: """ self.__anchors = self.__anchors_tiny_yolov3 self.__model_type = "retinanet" def setModelPath(self, path: str) -> None: """ 'setModelPath()' function is required and is used to set the file path to the model adopted from the list of the available 3 model types. The model path must correspond to the model type. :param model_path: :return: """ if os.path.isfile(path): extension_check(path) self.__model_path = path self.__model_loaded = False else: raise ValueError( "invalid path, path not pointing to a valid file." ) from None def useCPU(self): """ Used to force classification to be done on CPU. By default, classification will occur on GPU compute if available else CPU compute. """ self.__device = "cpu" if self.__model_loaded: self.__model_loaded = False self.loadModel() def loadModel(self) -> None: """ 'loadModel()' function is used to load the model weights into the model architecture from the file path defined in the setModelPath() function. 
:return: """ if not self.__model_loaded: if self.__model_type=="yolov3": self.__model = YoloV3( anchors=self.__anchors , num_classes=len(self.__classes),\ device=self.__device ) elif self.__model_type=="tiny-yolov3": self.__model = YoloV3Tiny( anchors=self.__anchors, num_classes=len(self.__classes), device=self.__device ) elif self.__model_type=="retinanet": self.__classes = self.__load_classes(os.path.join(os.path.dirname(os.path.abspath(__file__)), "coco91_classes.txt")) self.__model = torchvision.models.detection.retinanet_resnet50_fpn( pretrained=False, num_classes=91, pretrained_backbone = False ) else: raise ValueError(f"Invalid model type. Call setModelTypeAsYOLOv3(), setModelTypeAsTinyYOLOv3() or setModelTypeAsRetinaNet to set a model type before loading the model") state_dict = torch.load(self.__model_path, map_location=self.__device) try: self.__model.load_state_dict(state_dict) self.__model_loaded = True self.__model.to(self.__device).eval() except: raise RuntimeError("Invalid weights!!!") from None def CustomObjects(self, **kwargs): """ The 'CustomObjects()' function allows you to handpick the type of objects ( from the COCO classes ) you want to detect from an image. The objects are pre-initiated in the function variables and predefined as 'False', which you can easily set to true for any number of objects available. This function returns a dictionary which must be parsed into the 'detectObjectsFromImage()'. 
Detecting custom objects only happens when you call the function 'detectObjectsFromImage()' Acceptable values are 'True' and 'False' for all object values present :param boolean_values: :return: custom_objects_dict """ if not self.__model_loaded: self.loadModel() all_objects_str = (obj_label.replace(" ", "_") for obj_label in self.__classes) all_objects_dict = {} for object_str in all_objects_str: all_objects_dict[object_str] = False for karg in kwargs: if karg in all_objects_dict: all_objects_dict[karg] = kwargs[karg] else: raise ValueError(f" object '{karg}' doesn't exist in the supported object classes") return all_objects_dict def detectObjectsFromImage(self, input_image: Union[str, np.ndarray, Image.Image], output_image_path: str=None, output_type: str ="file", extract_detected_objects: bool=False, minimum_percentage_probability: int=50, display_percentage_probability: bool=True, display_object_name: bool=True, display_box: bool=True, custom_objects: List=None ) -> Union[List[List[Tuple[str, float, Dict[str, int]]]], np.ndarray, List[np.ndarray], List[str]]: """ Detects objects in an image using the unique classes provided by COCO. :param input_image: path to an image file, cv2 image or PIL image :param output_image_path: path to save input image with predictions rendered :param output_type: type of output for rendered image. 
Acceptable values are 'file' and 'array` ( a cv2 image ) :param extract_detected_objects: extract each object based on the output type :param minimum_percentage_probability: the minimum confidence a detected object must have :param display_percentage_probability: to diplay/not display the confidence on rendered image :param display_object_name: to diplay/not display the object name on rendered image :param display_box: to diplay/not display the object bounding box on rendered image :param custom_objects: a dictionary of detectable objects set to boolean values :returns: A list of tuples containing the label of detected object and the confidence. """ self.__model.eval() if not self.__model_loaded: if self.__model_path: warnings.warn( "Model path has changed but pretrained weights in the" " new path is yet to be loaded.", ResourceWarning ) else: raise RuntimeError( "Model path isn't set, pretrained weights aren't used." ) predictions = defaultdict(lambda : []) if self.__model_type == "yolov3" or self.__model_type == "tiny-yolov3": fnames, original_imgs, input_dims, imgs = self.__load_image_yolo(input_image) with torch.no_grad(): output = self.__model(imgs) output = get_predictions( pred=output.to(self.__device), num_classes=len(self.__classes), nms_confidence_level=self.__nms_score, objectness_confidence= self.__objectness_score, device=self.__device ) if output is None: if output_type == "array": if extract_detected_objects: return original_imgs[0], [], [] else: return original_imgs[0], [] else: if extract_detected_objects: return original_imgs[0], [] else: return [] # scale the output to match the dimension of the original image input_dims = torch.index_select(input_dims, 0, output[:, 0].long()) scaling_factor = torch.min(416 / input_dims, 1)[0].view(-1, 1) output[:, [1,3]] -= (416 - (scaling_factor * input_dims[:, 0].view(-1,1))) / 2 output[:, [2,4]] -= (416 - (scaling_factor * input_dims[:, 1].view(-1,1))) / 2 output[:, 1:5] /= scaling_factor #clip bounding box 
for those that extended outside the detected image. for idx in range(output.shape[0]): output[idx, [1,3]] = torch.clamp(output[idx, [1,3]], 0.0, input_dims[idx, 0]) output[idx, [2,4]] = torch.clamp(output[idx, [2,4]], 0.0, input_dims[idx, 1]) for pred in output: pred_label = self.__classes[int(pred[-1])] if custom_objects: if pred_label.replace(" ", "_") in custom_objects.keys(): if not custom_objects[pred_label.replace(" ", "_")]: continue else: continue predictions[int(pred[0])].append(( pred_label, float(pred[-2]), {k:v for k,v in zip(["x1", "y1", "x2", "y2"], map(int, pred[1:5]))}, )) elif self.__model_type == "retinanet": fnames, original_imgs, scaled_images = self.__load_image_retinanet(input_image) with torch.no_grad(): output = self.__model(scaled_images) if output is None: if output_type == "array": if extract_detected_objects: return original_imgs[0], [], [] else: return original_imgs[0], [] else: if extract_detected_objects: return original_imgs[0], [] else: return [] for idx, pred in enumerate(output): for id in range(pred["labels"].shape[0]): if pred["scores"][id] >= self.__objectness_score: pred_label = self.__classes[pred["labels"][id]] if custom_objects: if pred_label.replace(" ", "_") in custom_objects.keys(): if not custom_objects[pred_label.replace(" ", "_")]: continue else: continue predictions[idx].append( ( pred_label, pred["scores"][id].item(), {k:v for k,v in zip(["x1", "y1", "x2", "y2"], map(int, pred["boxes"][id]))} ) ) # Render detection on copy of input image original_input_image = None output_image_array = None extracted_objects = [] if self.__model_type == "yolov3" or self.__model_type == "tiny-yolov3": original_input_image = cv2.cvtColor(original_imgs[0], cv2.COLOR_RGB2BGR) if isinstance(output, torch.Tensor): for pred in output: percentage_conf = round(float(pred[-2]) * 100, 2) if percentage_conf < minimum_percentage_probability: continue displayed_label = "" if display_object_name: displayed_label = 
f"{self.__classes[int(pred[-1].item())]} : " if display_percentage_probability: displayed_label += f" {percentage_conf}%" original_imgs[int(pred[0].item())] = draw_bbox_and_label(pred[1:5].int() if display_box else None, displayed_label, original_imgs[int(pred[0].item())] ) output_image_array = cv2.cvtColor(original_imgs[0], cv2.COLOR_RGB2BGR) elif self.__model_type == "retinanet": original_input_image = tensor_to_ndarray(original_imgs[0].div(255.0)) original_input_image = cv2.cvtColor(original_input_image, cv2.COLOR_RGB2BGR) for idx, pred in predictions.items(): max_dim = max(list(original_imgs[idx].size())) for label, score, bbox in pred: percentage_conf = round(score * 100, 2) if percentage_conf < minimum_percentage_probability: continue displayed_label = "" if display_object_name: displayed_label = f"{label} :" if display_percentage_probability: displayed_label += f" {percentage_conf}%" original_imgs[idx] = draw_bounding_boxes_and_labels( image=original_imgs[idx], boxes=torch.Tensor([[bbox["x1"], bbox["y1"], bbox["x2"], bbox["y2"]]]), draw_boxes=display_box, labels=[displayed_label], label_color=(0, 0, 255), box_color=(0, 255, 0), width=1, fill=False, font_size=int(max_dim / 30) ) output_image_array = tensor_to_ndarray(original_imgs[0].div(255.0)) output_image_array = cv2.cvtColor(output_image_array, cv2.COLOR_RGB2BGR) # Format predictions for function reponse predictions_batch = list(predictions.values()) predictions_list = predictions_batch[0] if len(predictions_batch) > 0 else [] min_probability = minimum_percentage_probability / 100 if output_type == "file": if output_image_path: cv2.imwrite(output_image_path, output_image_array) if extract_detected_objects: extraction_dir = ".".join(output_image_path.split(".")[:-1]) + "-extracted" os.mkdir(extraction_dir) count = 0 for obj_prediction in predictions_list: if obj_prediction[1] >= min_probability: count += 1 extracted_path = os.path.join( extraction_dir, 
".".join(os.path.basename(output_image_path).split(".")[:-1]) + f"-{count}.jpg" ) obj_bbox = obj_prediction[2] cv2.imwrite(extracted_path, original_input_image[obj_bbox["y1"] : obj_bbox["y2"], obj_bbox["x1"] : obj_bbox["x2"]]) extracted_objects.append(extracted_path) elif output_type == "array": if extract_detected_objects: for obj_prediction in predictions_list: if obj_prediction[1] >= min_probability: obj_bbox = obj_prediction[2] extracted_objects.append(original_input_image[obj_bbox["y1"] : obj_bbox["y2"], obj_bbox["x1"] : obj_bbox["x2"]]) else: raise ValueError(f"Invalid output_type '{output_type}'. Supported values are 'file' and 'array' ") predictions_list = [ { "name": prediction[0], "percentage_probability": round(prediction[1] * 100, 2), "box_points": [prediction[2]["x1"], prediction[2]["y1"], prediction[2]["x2"], prediction[2]["y2"]] } for prediction in predictions_list if prediction[1] >= min_probability ] if output_type == "array": if extract_detected_objects: return output_image_array, predictions_list, extracted_objects else: return output_image_array, predictions_list else: if extract_detected_objects: return predictions_list, extracted_objects else: return predictions_list class VideoObjectDetection: """ This is the object detection class for videos and camera live stream inputs in the ImageAI library. It provides support for RetinaNet, YOLOv3 and TinyYOLOv3 object detection networks. After instantiating this class, you can set it's properties and make object detections using it's pre-defined functions. 
The following functions are required to be called before object detection can be made * setModelPath() * At least of of the following and it must correspond to the model set in the setModelPath() [setModelTypeAsRetinaNet(), setModelTypeAsYOLOv3(), setModelTinyYOLOv3()] * loadModel() [This must be called once only before performing object detection] Once the above functions have been called, you can call the detectObjectsFromVideo() function or the detectCustomObjectsFromVideo() of the object detection instance object at anytime to obtain observable objects in any video or camera live stream. """ def __init__(self): self.__detector = ObjectDetection() def setModelTypeAsYOLOv3(self): self.__detector.setModelTypeAsYOLOv3() def setModelTypeAsTinyYOLOv3(self): self.__detector.setModelTypeAsTinyYOLOv3() def setModelTypeAsRetinaNet(self): self.__detector.setModelTypeAsRetinaNet() def setModelPath(self, model_path: str): extension_check(model_path) self.__detector.setModelPath(model_path) def loadModel(self): self.__detector.loadModel() def useCPU(self): self.__detector.useCPU() def CustomObjects(self, **kwargs): return self.__detector.CustomObjects(**kwargs) def detectObjectsFromVideo(self, input_file_path="", camera_input=None, output_file_path="", frames_per_second=20, frame_detection_interval=1, minimum_percentage_probability=50, log_progress=False, display_percentage_probability=True, display_object_name=True, display_box=True, save_detected_video=True, per_frame_function=None, per_second_function=None, per_minute_function=None, video_complete_function=None, return_detected_frame=False, detection_timeout = None, custom_objects=None): """ 'detectObjectsFromVideo()' function is used to detect objects observable in the given video path or a camera input: * input_file_path , which is the file path to the input video. 
It is required only if 'camera_input' is not set * camera_input , allows you to parse in camera input for live video detections * output_file_path , which is the path to the output video. It is required only if 'save_detected_video' is not set to False * frames_per_second , which is the number of frames to be used in the output video * frame_detection_interval (optional, 1 by default) , which is the intervals of frames that will be detected. * minimum_percentage_probability (optional, 50 by default) , option to set the minimum percentage probability for nominating a detected object for output. * log_progress (optional) , which states if the progress of the frame processed is to be logged to console * display_percentage_probability (optional), can be used to hide or show probability scores on the detected video frames * display_object_name (optional), can be used to show or hide object names on the detected video frames * save_save_detected_video (optional, True by default), can be set to or not to save the detected video * per_frame_function (optional), this parameter allows you to parse in a function you will want to execute after each frame of the video is detected. If this parameter is set to a function, after every video frame is detected, the function will be executed with the following values parsed into it: -- position number of the frame -- an array of dictinaries, with each dictionary corresponding to each object detected. Each dictionary contains 'name', 'percentage_probability' and 'box_points' -- a dictionary with with keys being the name of each unique objects and value are the number of instances of the object present -- If return_detected_frame is set to True, the numpy array of the detected frame will be parsed as the fourth value into the function * per_second_function (optional), this parameter allows you to parse in a function you will want to execute after each second of the video is detected. 
If this parameter is set to a function, after every second of a video is detected, the function will be executed with the following values parsed into it: -- position number of the second -- an array of dictionaries whose keys are position number of each frame present in the last second , and the value for each key is the array for each frame that contains the dictionaries for each object detected in the frame -- an array of dictionaries, with each dictionary corresponding to each frame in the past second, and the keys of each dictionary are the name of the number of unique objects detected in each frame, and the key values are the number of instances of the objects found in the frame -- a dictionary with its keys being the name of each unique object detected throughout the past second, and the key values are the average number of instances of the object found in all the frames contained in the past second -- If return_detected_frame is set to True, the numpy array of the detected frame will be parsed as the fifth value into the function * per_minute_function (optional), this parameter allows you to parse in a function you will want to execute after each minute of the video is detected. 
If this parameter is set to a function, after every minute of a video is detected, the function will be executed with the following values parsed into it: -- position number of the minute -- an array of dictionaries whose keys are position number of each frame present in the last minute , and the value for each key is the array for each frame that contains the dictionaries for each object detected in the frame -- an array of dictionaries, with each dictionary corresponding to each frame in the past minute, and the keys of each dictionary are the name of the number of unique objects detected in each frame, and the key values are the number of instances of the objects found in the frame -- a dictionary with its keys being the name of each unique object detected throughout the past minute, and the key values are the average number of instances of the object found in all the frames contained in the past minute -- If return_detected_frame is set to True, the numpy array of the detected frame will be parsed as the fifth value into the function * video_complete_function (optional), this parameter allows you to parse in a function you will want to execute after all of the video frames have been detected. 
If this parameter is set to a function, after all of frames of a video is detected, the function will be executed with the following values parsed into it: -- an array of dictionaries whose keys are position number of each frame present in the entire video , and the value for each key is the array for each frame that contains the dictionaries for each object detected in the frame -- an array of dictionaries, with each dictionary corresponding to each frame in the entire video, and the keys of each dictionary are the name of the number of unique objects detected in each frame, and the key values are the number of instances of the objects found in the frame -- a dictionary with its keys being the name of each unique object detected throughout the entire video, and the key values are the average number of instances of the object found in all the frames contained in the entire video * return_detected_frame (optionally, False by default), option to obtain the return the last detected video frame into the per_per_frame_function, per_per_second_function or per_per_minute_function * detection_timeout (optionally, None by default), option to state the number of seconds of a video that should be detected after which the detection function stop processing the video * thread_safe (optional, False by default), enforce the loaded detection model works across all threads if set to true, made possible by forcing all Tensorflow inference to run on the default graph. 
:param input_file_path: :param camera_input :param output_file_path: :param save_detected_video: :param frames_per_second: :param frame_detection_interval: :param minimum_percentage_probability: :param log_progress: :param display_percentage_probability: :param display_object_name: :param per_frame_function: :param per_second_function: :param per_minute_function: :param video_complete_function: :param return_detected_frame: :param detection_timeout: :param thread_safe: :return output_video_filepath: :return counting: :return output_objects_array: :return output_objects_count: :return detected_copy: :return this_second_output_object_array: :return this_second_counting_array: :return this_second_counting: :return this_minute_output_object_array: :return this_minute_counting_array: :return this_minute_counting: :return this_video_output_object_array: :return this_video_counting_array: :return this_video_counting: """ if (input_file_path == "" and camera_input == None): raise ValueError( "You must set 'input_file_path' to a valid video file, or set 'camera_input' to a valid camera") elif (save_detected_video == True and output_file_path == ""): raise ValueError( "You must set 'output_video_filepath' to a valid video file name, in which the detected video will be saved. 
If you don't intend to save the detected video, set 'save_detected_video=False'") else: try: output_frames_dict = {} output_frames_count_dict = {} input_video = cv2.VideoCapture(input_file_path) if (camera_input != None): input_video = camera_input output_video_filepath = output_file_path + '.mp4' frame_width = int(input_video.get(3)) frame_height = int(input_video.get(4)) output_video = cv2.VideoWriter(output_video_filepath, cv2.VideoWriter_fourcc(*"MP4V"), frames_per_second, (frame_width, frame_height)) counting = 0 detection_timeout_count = 0 video_frames_count = 0 while (input_video.isOpened()): ret, frame = input_video.read() if (ret == True): video_frames_count += 1 if (detection_timeout != None): if ((video_frames_count % frames_per_second) == 0): detection_timeout_count += 1 if (detection_timeout_count >= detection_timeout): break output_objects_array = [] counting += 1 if (log_progress == True): print("Processing Frame : ", str(counting)) detected_copy = frame.copy() check_frame_interval = counting % frame_detection_interval if (counting == 1 or check_frame_interval == 0): try: detected_copy, output_objects_array = self.__detector.detectObjectsFromImage( input_image=frame, output_type="array", minimum_percentage_probability=minimum_percentage_probability, display_percentage_probability=display_percentage_probability, display_object_name=display_object_name, display_box=display_box, custom_objects=custom_objects) except: None output_frames_dict[counting] = output_objects_array output_objects_count = {} for eachItem in output_objects_array: eachItemName = eachItem["name"] try: output_objects_count[eachItemName] = output_objects_count[eachItemName] + 1 except: output_objects_count[eachItemName] = 1 output_frames_count_dict[counting] = output_objects_count if (save_detected_video == True): output_video.write(detected_copy) if (counting == 1 or check_frame_interval == 0): if (per_frame_function != None): if (return_detected_frame == True): 
per_frame_function(counting, output_objects_array, output_objects_count, detected_copy) elif (return_detected_frame == False): per_frame_function(counting, output_objects_array, output_objects_count) if (per_second_function != None): if (counting != 1 and (counting % frames_per_second) == 0): this_second_output_object_array = [] this_second_counting_array = [] this_second_counting = {} for aa in range(counting): if (aa >= (counting - frames_per_second)): this_second_output_object_array.append(output_frames_dict[aa + 1]) this_second_counting_array.append(output_frames_count_dict[aa + 1]) for eachCountingDict in this_second_counting_array: for eachItem in eachCountingDict: try: this_second_counting[eachItem] = this_second_counting[eachItem] + \ eachCountingDict[eachItem] except: this_second_counting[eachItem] = eachCountingDict[eachItem] for eachCountingItem in this_second_counting: this_second_counting[eachCountingItem] = int(this_second_counting[eachCountingItem] / frames_per_second) if (return_detected_frame == True): per_second_function(int(counting / frames_per_second), this_second_output_object_array, this_second_counting_array, this_second_counting, detected_copy) elif (return_detected_frame == False): per_second_function(int(counting / frames_per_second), this_second_output_object_array, this_second_counting_array, this_second_counting) if (per_minute_function != None): if (counting != 1 and (counting % (frames_per_second * 60)) == 0): this_minute_output_object_array = [] this_minute_counting_array = [] this_minute_counting = {} for aa in range(counting): if (aa >= (counting - (frames_per_second * 60))): this_minute_output_object_array.append(output_frames_dict[aa + 1]) this_minute_counting_array.append(output_frames_count_dict[aa + 1]) for eachCountingDict in this_minute_counting_array: for eachItem in eachCountingDict: try: this_minute_counting[eachItem] = this_minute_counting[eachItem] + \ eachCountingDict[eachItem] except: this_minute_counting[eachItem] = 
eachCountingDict[eachItem] for eachCountingItem in this_minute_counting: this_minute_counting[eachCountingItem] = int(this_minute_counting[eachCountingItem] / (frames_per_second * 60)) if (return_detected_frame == True): per_minute_function(int(counting / (frames_per_second * 60)), this_minute_output_object_array, this_minute_counting_array, this_minute_counting, detected_copy) elif (return_detected_frame == False): per_minute_function(int(counting / (frames_per_second * 60)), this_minute_output_object_array, this_minute_counting_array, this_minute_counting) else: break if (video_complete_function != None): this_video_output_object_array = [] this_video_counting_array = [] this_video_counting = {} for aa in range(counting): this_video_output_object_array.append(output_frames_dict[aa + 1]) this_video_counting_array.append(output_frames_count_dict[aa + 1]) for eachCountingDict in this_video_counting_array: for eachItem in eachCountingDict: try: this_video_counting[eachItem] = this_video_counting[eachItem] + \ eachCountingDict[eachItem] except: this_video_counting[eachItem] = eachCountingDict[eachItem] for eachCountingItem in this_video_counting: this_video_counting[eachCountingItem] = int(this_video_counting[eachCountingItem] / counting) video_complete_function(this_video_output_object_array, this_video_counting_array, this_video_counting) input_video.release() output_video.release() if (save_detected_video == True): return output_video_filepath except: raise ValueError( "An error occured. It may be that your input video is invalid. Ensure you specified a proper string value for 'output_file_path' is 'save_detected_video' is not False. " "Also ensure your per_frame, per_second, per_minute or video_complete_analysis function is properly configured to receive the right parameters. 
") ================================================ FILE: imageai/Detection/coco91_classes.txt ================================================ unlabeled person bicycle car motorcycle airplane bus train truck boat traffic light fire hydrant street sign stop sign parking meter bench bird cat dog horse sheep cow elephant bear zebra giraffe hat backpack umbrella shoe eye glasses handbag tie suitcase frisbee skis snowboard sports ball kite baseball bat baseball glove skateboard surfboard tennis racket bottle plate wine glass cup fork knife spoon bowl banana apple sandwich orange broccoli carrot hot dog pizza donut cake chair couch potted plant bed mirror dining table window desk toilet door tv laptop mouse remote keyboard cell phone microwave oven toaster sink refrigerator blender book clock vase scissors teddy bear hair drier toothbrush hair brush ================================================ FILE: imageai/Detection/coco_classes.txt ================================================ person bicycle car motorbike aeroplane bus train truck boat traffic light fire hydrant stop sign parking meter bench bird cat dog horse sheep cow elephant bear zebra giraffe backpack umbrella handbag tie suitcase frisbee skis snowboard sports ball kite baseball bat baseball glove skateboard surfboard tennis racket bottle wine glass cup fork knife spoon bowl banana apple sandwich orange broccoli carrot hot dog pizza donut cake chair sofa pottedplant bed diningtable toilet tvmonitor laptop mouse remote keyboard cell phone microwave oven toaster sink refrigerator book clock vase scissors teddy bear hair drier toothbrush ================================================ FILE: imageai/__init__.py ================================================ from .backend_check import backend_check ================================================ FILE: imageai/backend_check/__init__.py ================================================ ================================================ FILE: 
# Import-time dependency guard for ImageAI 3.x.
#
# ImageAI needs PyTorch and TorchVision. When they cannot be imported, this
# module raises a RuntimeError whose message depends on whether a legacy
# TensorFlow/Keras installation is present, so users of old '.h5' models are
# told to install an older ImageAI release instead of being told to install
# PyTorch.
try:
    import torch
    import torchvision
except Exception:
    try:
        import tensorflow
        import keras
    except Exception:
        raise RuntimeError("Dependency error!!! PyTorch and TorchVision are not installed. Please see installation instructions in the documentation https://imageai.readthedocs.io/")
    else:
        # Raised in the ``else`` clause (outside the inner ``try`` body) so the
        # handler above cannot catch it and replace it with the generic
        # "not installed" message.
        raise RuntimeError("Dependency error!!! It appears you are trying to use ImageAI with a Tensorflow backend. ImageAI now uses PyTorch as backed as from version 3.0.2 . If you want to use the Tensorflow models or a customly trained '.h5' model, install ImageAI 2.1.6 or earlier. To use the latest Pytorch models, see the documentation in https://imageai.readthedocs.io/")
class DenseNet121Pretrained:
    """
    Easy image classification with the DenseNet121 computer vision model.

    Usage: create the object with a label file, point it at a weight file
    with `set_model_path` (or the `model_path` property), call `load_model`,
    then call `classify`.
    """
    def __init__(self, label_path : str) -> None:
        """
        Parameters:
        -----------
        label_path: path to a text file with one class name per line; the
            line position is used as the class index.
        """
        self.__device = "cuda" if torch.cuda.is_available() else "cpu"
        # The inputs are moved to self.__device in classify(), so the network
        # has to live on the same device.
        self.__model = torchvision.models.densenet121(pretrained=False).to(self.__device)
        self.__classes = self.__load_classes(label_path)
        self.__has_loaded_weights = False
        self.__model_path = ""

    def __load_classes(self, path : str) -> List[str]:
        """Returns the stripped lines of the label file at `path`."""
        with open(path) as f:
            unique_classes = [c.strip() for c in f.readlines()]
        return unique_classes

    def __load_image(self, image_path : str) -> Tuple[List[str], torch.Tensor]:
        """
        Loads image/images from the given path. If image_path is a directory,
        only the images directly inside it are loaded (sub-directories are
        not visited). Every image is converted to RGB, resized to 256,
        center-cropped to 224 and normalized with the mean/std listed below.

        Returns the file names and the stacked image tensor.
        Raises RuntimeError when no image could be loaded.
        """
        allowed_file_extensions = ["jpg", "jpeg", "png"]
        images = []
        fnames = []
        preprocess = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        if os.path.isfile(image_path):
            img = Image.open(image_path).convert("RGB")
            images.append(preprocess(img))
            fnames.append(os.path.basename(image_path))

        elif os.path.isdir(image_path):
            for file in os.listdir(image_path):
                if os.path.isfile(os.path.join(image_path, file)) and\
                        file.rsplit('.')[-1].lower() in allowed_file_extensions:
                    img = Image.open(os.path.join(image_path, file)).convert("RGB")
                    images.append(preprocess(img))
                    fnames.append(file)
        if images:
            return fnames, torch.stack(images)
        raise RuntimeError(
                f"Error loading images from {os.path.abspath(image_path)}."
                "\nEnsure the folder contains images,"
                " allowed file extensions are .jpg, .jpeg, .png"
            )

    # properties
    model_path = property(
        fget=lambda self : self.__model_path,
        fset=lambda self, path: self.set_model_path(path),
        doc="Path containing the pretrained weight."
    )

    def set_model_path(self, path : str) -> None:
        """
        Sets the path to the pretrained weight and marks the weights as not
        loaded, so the next `load_model` call reads the new file.

        Raises ValueError when `path` is not an existing file.
        """
        if os.path.isfile(path):
            self.__model_path = path
            self.__has_loaded_weights = False
        else:
            raise ValueError(
                "parameter path should be a path to the pretrianed weight file."
            )

    def load_model(self) -> None:
        """
        Loads the DenseNet121 weights from the configured model path into
        the model architecture and switches the model to eval mode.

        Does nothing when the weights are already loaded. On failure a
        message is printed and the weights stay marked as not loaded.
        """
        if not self.__has_loaded_weights:
            try:
                import re
                state_dict = torch.load(self.__model_path, map_location=self.__device)
                # '.'s are no longer allowed in module names, but previous densenet layers
                # as provided by the pytorch organization has names that uses '.'s.
                pattern = re.compile(
                        r"^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\."
                        "(?:weight|bias|running_mean|running_var))$"
                        )
                for key in list(state_dict.keys()):
                    res = pattern.match(key)
                    if res:
                        new_key = res.group(1) + res.group(2)
                        state_dict[new_key] = state_dict[key]
                        del state_dict[key]

                self.__model.load_state_dict(state_dict)
                self.__has_loaded_weights = True
                self.__model.eval()
            except Exception:
                print("Weight loading failed.\nEnsure the model path is"
                      " set and the weight file is in the specified model path.")

    def classify(self, image_path : str, top_n : int = 5, verbose : bool = True) -> List[List[Tuple[str, str]]]:
        """
        Classifies image/images according to the loaded class labels.

        Parameters:
        -----------
        image_path: a path to a single image or a path to a directory
            containing images (sub-directories are not visited).
        top_n: number of top predictions to return per image. Values above
            the number of model outputs are reduced to that number; values
            below 1 yield empty prediction lists.
        verbose: if true, it prints the top_n predictions.

        Returns one list per image holding (label, "probability%") tuples,
        best prediction first.
        """
        if not self.__has_loaded_weights:
            warnings.warn("Pretrained weights aren't loaded", ResourceWarning)

        fnames, images = self.__load_image(image_path)
        images = images.to(self.__device)

        with torch.no_grad():
            output = self.__model(images)
        probabilities = torch.softmax(output, dim=1)
        # torch.topk rejects k larger than the class dimension, so clamp it.
        k = max(0, min(int(top_n), probabilities.shape[1]))
        top_prob, top_catid = torch.topk(probabilities, k)

        predictions = [
                [
                    (self.__classes[int(catid)], f"{prob.item()*100:.5f}%")
                    for prob, catid in zip(row_prob, row_catid)
                ]
                for row_prob, row_catid in zip(top_prob, top_catid)
            ]

        if verbose:
            for idx, pred in enumerate(predictions):
                print("-"*50, f"Top {k} predictions for {fnames[idx]}", "-"*50, sep="\n")
                for label, score in pred:
                    print(f"\t{label}:{score: >10}")
                print("-"*50, "\n")
        return predictions
This function also convert the loaded image/images to the specification expected by the MobileNetV2 architecture. """ allowed_file_extensions = ["jpg", "jpeg", "png"] images = [] fnames = [] preprocess = transforms.Compose([ transforms.Resize(299), transforms.CenterCrop(299), transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ]) if os.path.isfile(image_path): img = Image.open(image_path).convert("RGB") images.append(preprocess(img)) fnames.append(os.path.basename(image_path)) elif os.path.isdir(image_path): for file in os.listdir(image_path): if os.path.isfile(os.path.join(image_path, file)) and\ file.rsplit('.')[-1].lower() in allowed_file_extensions: img = Image.open(os.path.join(image_path, file)).convert("RGB") images.append(preprocess(img)) fnames.append(file) if images: return fnames, torch.stack(images) raise RuntimeError( f"Error loading images from {os.path.abspath(image_path)}." "\nEnsure the folder contains images," " allowed file extensions are .jpg, .jpeg, .png" ) # properties model_path = property( fget=lambda self : self.__model_path, fset=lambda self, path: self.set_model_path(path), doc="Path containing the pretrained weight." ) def set_model_path(self, path : str) -> None: """ Sets the path to the pretrained weight. """ if os.path.isfile(path): self.__model_path = path self.__has_loaded_weights = False else: raise ValueError( "parameter path should be a path to the pretrianed weight file." ) def load_model(self) -> None: """ Loads the mobilenet vison weight into the model architecture. 
""" if not self.__has_loaded_weights: try: self.__model.load_state_dict( torch.load(self.__model_path, map_location=self.__device) ) self.__has_loaded_weights = True self.__model.eval() except Exception: print("Weight loading failed.\nEnsure the model path is" " set and the weight file is in the specified model path.") def classify(self, image_path : str, top_n : int = 5, verbose : bool = True) -> List[List[Tuple[str, str]]]: """ Classfies image/images according to the classes provided by imagenet. Parameters: ----------- image_path: a path to a single image or a path to a directory containing images. If image_path is a path to a file, this functions classifies the image according to the categories provided by imagenet, else, if image_path is a path to a directory that contains images, this function classifies all images in the given directory (it doesn't visit the subdirectories). top_n: number of top predictions to return. verbose: if true, it prints the top_n predictions. """ if not self.__has_loaded_weights: if self.__model_path: warnings.warn( "Model path has changed but pretrained weights in the" " new path are yet to be loaded.", ResourceWarning ) else: warnings.warn( "Model path isn't set, pretrained weights aren't used.", ResourceWarning ) fnames, images = self.__load_image(image_path) images = images.to(self.__device) print(images.shape) with torch.no_grad(): output = self.__model(images) probabilities = torch.softmax(output, dim=1) top5_prob, top5_catid = torch.topk(probabilities, 5) with open(os.path.join(str(Path(__file__).resolve().parent.parent), "imagenet_classes.txt")) as f: categories = [c.strip() for c in f.readlines()] predictions = [ [ (categories[top5_catid[i][j]], f"{top5_prob[i][j].item()*100:.5f}%") for j in range(top5_prob.shape[1]) ] for i in range(top5_prob.shape[0]) ] if verbose: for idx, pred in enumerate(predictions): print("-"*50, f"Top 5 predictions for {fnames[idx]}", "-"*50, sep="\n") for label, score in pred: 
print(f"\t{label}:{score: >10}") print("-"*50, "\n") return predictions ================================================ FILE: imageai/mobilenetv2/__init__.py ================================================ import os, warnings from pathlib import Path from typing import List, Tuple import torch, torchvision import torch.nn.functional as F from torchvision import transforms from PIL import Image warnings.filterwarnings("once", category=ResourceWarning) class MobileNetV2Pretrained: """ An implementation that allows for easy classification of images using the state of the art MobileNet computer vision model. """ def __init__(self, label_path : str) -> None: self.__model = torchvision.models.mobilenet_v2(pretrained=False) self.__classes = self.__load_classes(label_path) self.__has_loaded_weights = False self.__device = "cuda" if torch.cuda.is_available() else "cpu" self.__model_path = "" def __load_classes(self, path : str) -> List[str]: with open(path) as f: unique_classes = [c.strip() for c in f.readlines()] return unique_classes def __load_image(self, image_path : str) -> Tuple[List[str], torch.Tensor]: """ Loads image/images from the given path. If image_path is a directory, this function only load the images in the directory (it does not visit the sub- directories). This function also convert the loaded image/images to the specification expected by the MobileNetV2 architecture. 
""" allowed_file_extensions = ["jpg", "jpeg", "png"] images = [] fnames = [] preprocess = transforms.Compose([ transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ]) if os.path.isfile(image_path): img = Image.open(image_path).convert("RGB") images.append(preprocess(img)) fnames.append(os.path.basename(image_path)) elif os.path.isdir(image_path): for file in os.listdir(image_path): if os.path.isfile(os.path.join(image_path, file)) and\ file.rsplit('.')[-1].lower() in allowed_file_extensions: img = Image.open(os.path.join(image_path, file)).convert("RGB") images.append(preprocess(img)) fnames.append(file) if images: return fnames, torch.stack(images) raise RuntimeError( f"Error loading images from {os.path.abspath(image_path)}." "\nEnsure the folder contains images," " allowed file extensions are .jpg, .jpeg, .png" ) # properties model_path = property( fget=lambda self : self.__model_path, fset=lambda self, path: self.set_model_path(path), doc="Path containing the pretrained weight." ) def set_model_path(self, path : str) -> None: """ Sets the path to the pretrained weight. """ if os.path.isfile(path): self.__model_path = path self.__has_loaded_weight = False else: raise ValueError( "parameter path should be a valid path to the pretrianed weight file." ) def load_model(self) -> None: """ Loads the mobilenet vison weight into the model architecture. """ if not self.__has_loaded_weights: try: self.__model.load_state_dict( torch.load(self.__model_path, map_location=self.__device) ) self.__has_loaded_weights = True self.__model.eval() except Exception: print("Weight loading failed.\nEnsure the model path is" " set and the weight file is in the specified model path.") def classify(self, image_path : str, top_n : int = 5, verbose : bool = True) -> List[List[Tuple[str, str]]]: """ Classfies image/images according to the classes provided by imagenet. 
class ResNet50Pretrained:
    """
    Easy image classification with the ResNet50 computer vision model.

    Usage: create the object with a label file, point it at a weight file
    with `set_model_path` (or the `model_path` property), call `load_model`,
    then call `classify`.
    """
    def __init__(self, label_path : str) -> None:
        """
        Parameters:
        -----------
        label_path: path to a text file with one class name per line; the
            line position is used as the class index.
        """
        self.__device = "cuda" if torch.cuda.is_available() else "cpu"
        # The inputs are moved to self.__device in classify(), so the network
        # has to live on the same device.
        self.__model = torchvision.models.resnet50(pretrained=False).to(self.__device)
        self.__classes = self.__load_classes(label_path)
        self.__has_loaded_weights = False
        self.__model_path = ""

    def __load_classes(self, path : str) -> List[str]:
        """Returns the stripped lines of the label file at `path`."""
        with open(path) as f:
            unique_classes = [c.strip() for c in f.readlines()]
        return unique_classes

    def __load_image(self, image_path : str) -> Tuple[List[str], torch.Tensor]:
        """
        Loads image/images from the given path. If image_path is a directory,
        only the images directly inside it are loaded (sub-directories are
        not visited). Every image is converted to RGB, resized to 256,
        center-cropped to 224 and normalized with the mean/std listed below.

        Returns the file names and the stacked image tensor.
        Raises RuntimeError when no image could be loaded.
        """
        allowed_file_extensions = ["jpg", "jpeg", "png"]
        images = []
        fnames = []
        preprocess = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        if os.path.isfile(image_path):
            img = Image.open(image_path).convert("RGB")
            images.append(preprocess(img))
            fnames.append(os.path.basename(image_path))

        elif os.path.isdir(image_path):
            for file in os.listdir(image_path):
                if os.path.isfile(os.path.join(image_path, file)) and\
                        file.rsplit('.')[-1].lower() in allowed_file_extensions:
                    img = Image.open(os.path.join(image_path, file)).convert("RGB")
                    images.append(preprocess(img))
                    fnames.append(file)
        if images:
            return fnames, torch.stack(images)
        raise RuntimeError(
                f"Error loading images from {os.path.abspath(image_path)}."
                "\nEnsure the folder contains images,"
                " allowed file extensions are .jpg, .jpeg, .png"
            )

    # properties
    model_path = property(
        fget=lambda self : self.__model_path,
        fset=lambda self, path: self.set_model_path(path),
        doc="Path containing the pretrained weight."
    )

    def set_model_path(self, path : str) -> None:
        """
        Sets the path to the pretrained weight and marks the weights as not
        loaded, so the next `load_model` call reads the new file.

        Raises ValueError when `path` is not an existing file.
        """
        if os.path.isfile(path):
            self.__model_path = path
            self.__has_loaded_weights = False
        else:
            raise ValueError(
                "parameter path should be a path to the pretrianed weight file."
            )

    def load_model(self) -> None:
        """
        Loads the ResNet50 weights from the configured model path into the
        model architecture and switches the model to eval mode.

        Does nothing when the weights are already loaded. On failure a
        message is printed and the weights stay marked as not loaded.
        """
        if not self.__has_loaded_weights:
            try:
                self.__model.load_state_dict(
                    torch.load(self.__model_path, map_location=self.__device)
                )
                self.__has_loaded_weights = True
                self.__model.eval()
            except Exception:
                print("Weight loading failed.\nEnsure the model path is"
                      " set and the weight file is in the specified model path.")

    def classify(self, image_path : str, top_n : int = 5, verbose : bool = True) -> List[List[Tuple[str, str]]]:
        """
        Classifies image/images according to the loaded class labels.

        Parameters:
        -----------
        image_path: a path to a single image or a path to a directory
            containing images (sub-directories are not visited).
        top_n: number of top predictions to return per image. Values above
            the number of model outputs are reduced to that number; values
            below 1 yield empty prediction lists.
        verbose: if true, it prints the top_n predictions.

        Returns one list per image holding (label, "probability%") tuples,
        best prediction first.
        """
        if not self.__has_loaded_weights:
            if self.__model_path:
                warnings.warn(
                        "Model path has changed but pretrained weights in the"
                        " new path are yet to be loaded.",
                        ResourceWarning
                    )
            else:
                warnings.warn(
                        "Model path isn't set, pretrained weights aren't used.",
                        ResourceWarning
                    )

        fnames, images = self.__load_image(image_path)
        images = images.to(self.__device)

        with torch.no_grad():
            output = self.__model(images)
        probabilities = torch.softmax(output, dim=1)
        # torch.topk rejects k larger than the class dimension, so clamp it.
        k = max(0, min(int(top_n), probabilities.shape[1]))
        top_prob, top_catid = torch.topk(probabilities, k)

        predictions = [
                [
                    (self.__classes[int(catid)], f"{prob.item()*100:.5f}%")
                    for prob, catid in zip(row_prob, row_catid)
                ]
                for row_prob, row_catid in zip(top_prob, top_catid)
            ]

        if verbose:
            for idx, pred in enumerate(predictions):
                print("-"*50, f"Top {k} predictions for {fnames[idx]}", "-"*50, sep="\n")
                for label, score in pred:
                    print(f"\t{label}:{score: >10}")
                print("-"*50, "\n")
        return predictions
The values of the output tensor are uint8 in [0, 255]. Args: input (Tensor): a one dimensional uint8 tensor containing the raw bytes of the PNG or JPEG image. mode (ImageReadMode): the read mode used for optionally converting the image. Default: ``ImageReadMode.UNCHANGED``. See ``ImageReadMode`` class for more information on various available modes. Returns: output (Tensor[image_channels, image_height, image_width]) """ output = torch.ops.image.decode_image(input, mode.value) return output def read_image(path: str, mode: ImageReadMode = ImageReadMode.UNCHANGED) -> torch.Tensor: """ Reads a JPEG or PNG image into a 3 dimensional RGB or grayscale Tensor. Optionally converts the image to the desired format. The values of the output tensor are uint8 in [0, 255]. Args: path (str): path of the JPEG or PNG image. mode (ImageReadMode): the read mode used for optionally converting the image. Default: ``ImageReadMode.UNCHANGED``. See ``ImageReadMode`` class for more information on various available modes. Returns: output (Tensor[image_channels, image_height, image_width]) """ data = read_file(path) return decode_image(data, mode) def _generate_color_palette(num_objects: int): palette = torch.tensor([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1]) return [tuple((i * palette) % 255) for i in range(num_objects)] @torch.no_grad() def make_grid( tensor: Union[torch.Tensor, List[torch.Tensor]], nrow: int = 8, padding: int = 2, normalize: bool = False, value_range: Optional[Tuple[int, int]] = None, scale_each: bool = False, pad_value: float = 0.0, **kwargs, ) -> torch.Tensor: """ Make a grid of images. Args: tensor (Tensor or list): 4D mini-batch Tensor of shape (B x C x H x W) or a list of images all of the same size. nrow (int, optional): Number of images displayed in each row of the grid. The final grid size is ``(B / nrow, nrow)``. Default: ``8``. padding (int, optional): amount of padding. Default: ``2``. 
normalize (bool, optional): If True, shift the image to the range (0, 1), by the min and max values specified by ``value_range``. Default: ``False``. value_range (tuple, optional): tuple (min, max) where min and max are numbers, then these numbers are used to normalize the image. By default, min and max are computed from the tensor. range (tuple. optional): .. warning:: This parameter was deprecated in ``0.12`` and will be removed in ``0.14``. Please use ``value_range`` instead. scale_each (bool, optional): If ``True``, scale each image in the batch of images separately rather than the (min, max) over all images. Default: ``False``. pad_value (float, optional): Value for the padded pixels. Default: ``0``. Returns: grid (Tensor): the tensor containing grid of images. """ if not (torch.is_tensor(tensor) or (isinstance(tensor, list) and all(torch.is_tensor(t) for t in tensor))): raise TypeError(f"tensor or list of tensors expected, got {type(tensor)}") if "range" in kwargs.keys(): warnings.warn( "The parameter 'range' is deprecated since 0.12 and will be removed in 0.14. " "Please use 'value_range' instead." ) value_range = kwargs["range"] # if list of tensors, convert to a 4D mini-batch Tensor if isinstance(tensor, list): tensor = torch.stack(tensor, dim=0) if tensor.dim() == 2: # single image H x W tensor = tensor.unsqueeze(0) if tensor.dim() == 3: # single image if tensor.size(0) == 1: # if single-channel, convert to 3-channel tensor = torch.cat((tensor, tensor, tensor), 0) tensor = tensor.unsqueeze(0) if tensor.dim() == 4 and tensor.size(1) == 1: # single-channel images tensor = torch.cat((tensor, tensor, tensor), 1) if normalize is True: tensor = tensor.clone() # avoid modifying tensor in-place if value_range is not None: assert isinstance( value_range, tuple ), "value_range has to be a tuple (min, max) if specified. 
@torch.no_grad()
def draw_bounding_boxes_and_labels(
    image: torch.Tensor,
    boxes: torch.Tensor,
    draw_boxes: bool,
    labels: Optional[List[str]] = None,
    label_color: Optional[Union[List[Union[str, Tuple[int, int, int]]], str, Tuple[int, int, int]]] = None,
    box_color: Optional[Union[List[Union[str, Tuple[int, int, int]]], str, Tuple[int, int, int]]] = None,
    fill: Optional[bool] = False,
    width: int = 1,
    font: Optional[str] = None,
    font_size: int = 10,
) -> torch.Tensor:

    """
    Draws bounding boxes and/or text labels on the given image.
    The values of the input image should be uint8 between 0 and 255.

    Args:
        image (Tensor): Tensor of shape (C x H x W) and dtype uint8, with C being 1 or 3.
        boxes (Tensor): Tensor of size (N, 4) containing bounding boxes in (xmin, ymin, xmax, ymax) format.
        draw_boxes (bool): If `True` a rectangle is drawn for every box; labels are drawn either way.
        labels (List[str]): List containing the labels of bounding boxes, one per box.
        label_color (color): Color of the label text, passed to PIL (e.g. "red", "#FF00FF" or ``(240, 10, 157)``).
            When `fill` is `True` it is also used for the box outline and, with alpha 100, for the box fill.
        box_color (color): Outline color of the boxes when `fill` is falsy; also the fallback
            fill/outline color when `fill` is `True` and `label_color` is not given.
        fill (bool): If `True` fills the bounding box with a semi-transparent color.
        width (int): Width of bounding box outline.
        font (str): A filename containing a TrueType font, handed to ``ImageFont.truetype``.
            The PIL default font is used when omitted.
        font_size (int): The requested font size in points (only used together with `font`).

    Returns:
        img (Tensor[C, H, W]): Image Tensor of dtype uint8 with boxes and labels plotted.

    Raises:
        TypeError: if `image` is not a Tensor.
        ValueError: if `image` is not uint8, not 3-dimensional, not 1/3 channel,
            or if the number of labels differs from the number of boxes.
    """

    if not isinstance(image, torch.Tensor):
        raise TypeError(f"Tensor expected, got {type(image)}")
    elif image.dtype != torch.uint8:
        raise ValueError(f"Tensor uint8 expected, got {image.dtype}")
    elif image.dim() != 3:
        raise ValueError("Pass individual images, not batches")
    elif image.size(0) not in {1, 3}:
        raise ValueError("Only grayscale and RGB images are supported")

    num_boxes = boxes.shape[0]

    if labels is None:
        labels = [None] * num_boxes
    elif len(labels) != num_boxes:
        raise ValueError(
            f"Number of boxes ({num_boxes}) and labels ({len(labels)}) mismatch. Please specify labels for each box."
        )

    # Handle Grayscale images
    if image.size(0) == 1:
        image = torch.tile(image, (3, 1, 1))

    ndarr = image.permute(1, 2, 0).cpu().numpy()
    img_to_draw = Image.fromarray(ndarr)
    img_boxes = boxes.to(torch.int64).tolist()

    if fill:
        draw = ImageDraw.Draw(img_to_draw, "RGBA")
    else:
        draw = ImageDraw.Draw(img_to_draw)

    txt_font = ImageFont.load_default() if font is None else ImageFont.truetype(font=font, size=font_size)

    # Resolve the fill/outline colors once. The fill color needs an RGB tuple
    # to append the alpha value to, so named/hex colors are converted and a
    # missing label_color falls back to box_color instead of failing on
    # ``None + (100,)``.
    filled_outline = label_color if label_color is not None else box_color
    fill_color = None
    if fill and filled_outline is not None:
        base_rgb = ImageColor.getrgb(filled_outline) if isinstance(filled_outline, str) else tuple(filled_outline)
        fill_color = tuple(base_rgb[:3]) + (100,)

    for bbox, label in zip(img_boxes, labels):
        if draw_boxes:
            if fill:
                # fill_color is None when no color was supplied at all; PIL
                # then draws the outline only.
                draw.rectangle(bbox, width=width, outline=filled_outline, fill=fill_color)
            else:
                draw.rectangle(bbox, width=width, outline=box_color)

        if label is not None:
            margin = width + 1
            draw.text((bbox[0] + margin, bbox[1] + margin), label, fill=label_color, font=txt_font)

    return torch.from_numpy(np.array(img_to_draw)).permute(2, 0, 1).to(dtype=torch.uint8)
""" grid = make_grid(tensor, **kwargs) # Add 0.5 after unnormalizing to [0, 255] to round to nearest integer ndarr = grid.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to("cpu", torch.uint8).numpy() return ndarr ================================================ FILE: imageai/yolov3/__init__.py ================================================ ================================================ FILE: imageai/yolov3/tiny_yolov3.py ================================================ from typing import Union, List, Tuple, Optional import torch import torch.nn as nn import numpy as np from .yolov3 import DetectionLayer, ConvLayer class YoloV3Tiny(nn.Module): def __init__( self, anchors : Union[List[int], Tuple[int,...]], num_classes : int=80, device : str="cpu" ): super().__init__() # Network Layers self.conv1 = ConvLayer(3, 16) self.maxpool1 = nn.MaxPool2d(2, 2) self.conv2 = ConvLayer(16, 32) self.maxpool2 = nn.MaxPool2d(2, 2) self.conv3 = ConvLayer(32, 64) self.maxpool3 = nn.MaxPool2d(2, 2) self.conv4 = ConvLayer(64, 128) self.maxpool4 = nn.MaxPool2d(2, 2) self.conv5 = ConvLayer(128, 256) self.maxpool5 = nn.MaxPool2d(2, 2) self.conv6 = ConvLayer(256, 512) self.zeropad = nn.ZeroPad2d((0, 1, 0, 1)) self.maxpool6 = nn.MaxPool2d(2, 1) self.conv7 = ConvLayer(512, 1024) self.conv8 = ConvLayer(1024, 256, 1, 1) self.conv9 = ConvLayer(256, 512) self.conv10 = ConvLayer( 512, (3 * (5+num_classes)), 1, 1, use_batch_norm=False, activation="linear" ) self.yolo1 = DetectionLayer( num_classes=num_classes, anchors=anchors, anchor_masks=(3, 4, 5), device=device, layer=1 ) # self.__route_layer(conv8) self.conv11 = ConvLayer(256, 128, 1, 1) self.upsample1 = nn.Upsample( scale_factor=2, mode="nearest" #align_corners=True ) # self.__route_layer(upsample1, conv5) self.conv12 = ConvLayer(384, 256) self.conv13 = ConvLayer( 256, (3 * (5 + num_classes)), 1, 1, use_batch_norm=False, activation="linear" ) self.yolo2 = DetectionLayer( num_classes=num_classes, anchors=anchors, anchor_masks=(0, 1, 2), 
def draw_bbox_and_label(x : torch.Tensor, label : str, img : np.ndarray) -> np.ndarray:
    """
    Draws one predicted bounding box and its label on the image.

    Parameters:
    -----------
    x: four values (x1, y1, x2, y2), each convertible to ``int``, or ``None``
       to skip drawing.
    label: text written just inside the top-left corner of the box.
    img: image the box and label are drawn on.

    Returns:
    --------
    the image returned by the OpenCV drawing calls, or `img` itself when
    `x` is ``None``.
    """
    # The guard has to come before unpacking, otherwise None can never
    # reach it.
    if x is None:
        return img

    x1, y1, x2, y2 = map(int, x)
    img = cv.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 1)
    t_size = cv.getTextSize(label, cv.FONT_HERSHEY_PLAIN, 1, 1)[0]
    # shift the text baseline down by the text height so it sits inside the box
    img = cv.putText(img, label, (x1, y1 + t_size[1] + 4),
                     cv.FONT_HERSHEY_PLAIN, 1, (0, 0, 255), 1)
    return img


def letterbox_image(
        image : np.ndarray,
        inp_dim : Tuple[int, int]) -> np.ndarray:
    """
    Resizes an image to the dimension expected by the network while keeping
    the aspect ratio of the original image. The resized image is centred on
    a canvas filled with the value 128.

    Parameters:
    -----------
    image: array whose first two axes are (height, width).
    inp_dim: (width, height) expected by the network.

    Returns:
    --------
    an array of shape (net_h, net_w, 3) containing the centred, resized image.
    """
    img_h, img_w = image.shape[0], image.shape[1]  # original image dimension
    net_w, net_h = inp_dim  # the dimension expected by the network

    # largest scale at which the whole image still fits inside the network
    # input without changing its aspect ratio
    scale_factor = min(net_w / img_w, net_h / img_h)
    new_w = int(round(img_w * scale_factor))
    new_h = int(round(img_h * scale_factor))

    # cv.resize takes its target size as (width, height)
    resized_image = cv.resize(image, (new_w, new_h), interpolation=cv.INTER_CUBIC)

    # arrays are indexed (row, column) == (height, width); the canvas must be
    # allocated in that order or non-square network inputs break.
    canvas = np.full((net_h, net_w, 3), 128)
    top = (net_h - new_h) // 2
    left = (net_w - new_w) // 2
    canvas[top:top + new_h, left:left + new_w, :] = resized_image

    return canvas


def prepare_image(
        image : np.ndarray,
        inp_dim : Tuple[int, int]) -> torch.Tensor:
    """
    Prepares the input to match the expectation of the network.

    The image is letterboxed to `inp_dim`, its channel axis is reversed and
    moved first, and the result is returned as a float tensor divided by 255
    with a leading batch axis of size 1.
    """
    img = letterbox_image(image, inp_dim)
    # reverse the channel order (presumably BGR -> RGB for OpenCV images -
    # confirm with the caller) and go from HWC to CHW; copy() gives
    # torch.from_numpy an array without negative strides.
    img = img[:, :, ::-1].transpose((2, 0, 1)).copy()
    img = torch.from_numpy(img).float().div(255.0).unsqueeze(0)
    return img


def bbox_iou(bbox1 : torch.Tensor, bbox2 : torch.Tensor, device="cpu"):
    """
    Returns the IoU value of overlapping boxes.

    Parameters:
    -----------
    bbox1, bbox2: 2-D tensors whose first four columns are (x1, y1, x2, y2).
                  Row i of `bbox1` is compared with row i of `bbox2`
                  (subject to the usual broadcasting rules).
    device: kept for backward compatibility only; the computation now runs
            on whatever device the boxes already live on.

    Returns:
    --------
    a tensor holding intersection / union for every pair of boxes. Widths
    and heights are computed as `x2 - x1 + 1`.
    """
    b1_x1, b1_y1, b1_x2, b1_y2 = bbox1[:, 0], bbox1[:, 1], bbox1[:, 2], bbox1[:, 3]
    b2_x1, b2_y1, b2_x2, b2_y2 = bbox2[:, 0], bbox2[:, 1], bbox2[:, 2], bbox2[:, 3]

    # intersection rectangle; clamp turns "no overlap" (negative extent) into 0
    # without allocating a helper tensor on a possibly different device
    inter_w = (torch.min(b1_x2, b2_x2) - torch.max(b1_x1, b2_x1) + 1).clamp(min=0)
    inter_h = (torch.min(b1_y2, b2_y2) - torch.max(b1_y1, b2_y1) + 1).clamp(min=0)
    inter_area = inter_w * inter_h

    b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
    b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

    return inter_area / (b1_area + b2_area - inter_area)
5+num_classes) x grid_size x grid_size to batch_size x (grid_size * grid_size * anchors) x num_classes aids the concatenation of the prediction at the three detection layers and also for easy representation of the predicted bounding boxes. Also, transforms the bounding box predictions and the objectness score to match the discription specified in the paper: Bx = sigmoid(Tx) + Cx By = sigmoid(Ty) + Cy Bw = Pw(exp(Tw)) Bh = Ph(exp(Th)) Parameters: ----------- pred: prediction of the convolutional layer inp_dim: the dimension of images expected by the yolo neural network anchors: a list of anchors num_classes: the numbers of unique classes as specified by COCO. Returns: -------- the transformed input. """ batch_size = pred.shape[0] grid_size = pred.shape[2] stride = inp_dim // grid_size bbox_attrs = 5 + num_classes num_anchors = len(anchors) # transform input shape pred = pred.view(batch_size, bbox_attrs*num_anchors, grid_size*grid_size) pred = pred.transpose(1, 2).contiguous() pred = pred.view(batch_size, grid_size*grid_size*num_anchors, bbox_attrs) # since the dimensions of the anchors are in accordance with the original # dimension of the image, it's required to scale the dimension of the # anchors to match the dimension of the output of the convolutional # layer anchors = [(a[0] / stride, a[1] / stride) for a in anchors] # sigmoid the center_x, center_y and the objectness score pred[:, :, 0] = torch.sigmoid(pred[:, :, 0]) pred[:, :, 1] = torch.sigmoid(pred[:, :, 1]) pred[:, :, 4] = torch.sigmoid(pred[:, :, 4]) # add the center offsets grid = torch.arange(grid_size, dtype=torch.float) grid = np.arange(grid_size) x_o, y_o = np.meshgrid(grid, grid) #x_offset, y_offset = torch.meshgrid(grid, grid) x_offset = torch.FloatTensor(x_o).view(-1, 1).to(device) y_offset = torch.FloatTensor(y_o).view(-1, 1).to(device) #x_offset = x_offset.transpose(0,1).reshape(-1,1).to(device) #y_offset = y_offset.transpose(0,1).reshape(-1,1).to(device) x_y_offset = torch.cat([x_offset, 
def get_predictions(
        pred : torch.Tensor, num_classes : int,
        objectness_confidence : float = 0.5,
        nms_confidence_level : float = 0.4,
        device : str = "cpu") -> Union[torch.Tensor, None]:
    """
    This function filters the bounding boxes predicted by the network by
    first zeroing out bounding boxes that have a low objectness score, and
    then filtering overlapping bounding boxes using the non-maximum
    suppression algorithm.

    Parameters:
    -----------
    pred: a tensor (predicted output) of shape
          'batch_size x num_bboxes x bbox_attrs', where the first five
          attributes are center_x, center_y, width, height, objectness and
          the next `num_classes` attributes are the class scores.
    num_classes: the number of class scores stored per bounding box.
    objectness_confidence: boxes whose objectness is not above this value
          are zeroed out.
    nms_confidence_level: IoU threshold passed to `batched_nms`.
    device: kept for backward compatibility only; every intermediate tensor
          is created on the device `pred` lives on.

    Returns:
    --------
    A tensor with one row per kept box and the columns
    (batch_idx, x1, y1, x2, y2, objectness_score, class_score, class_idx),
    or None when `batched_nms` keeps nothing.
    """
    n_batch, n_bbox, n_attr = pred.shape

    # Multiplying creates a new tensor, so the in-place writes below never
    # touch the caller's data.
    keep = (pred[:, :, 4] > objectness_confidence).float().unsqueeze(2)
    pred = pred * keep

    # transform the predicted centers, width and height to top-left corner
    # and bottom-right corner coordinates, as expected by the NMS routine
    centers = pred[:, :, :2].clone()
    half_size = pred[:, :, 2:4] / 2
    pred[:, :, :2] = centers - half_size     # top-left x, y
    pred[:, :, 2:4] = centers + half_size    # bottom-right x, y

    flat = pred.reshape(-1, n_attr)

    # image index of every row, in the same dtype as the predictions so the
    # concatenation below is homogeneous
    image_idx = torch.arange(n_batch, device=flat.device).repeat_interleave(n_bbox)
    image_idx = image_idx.to(flat.dtype).unsqueeze(1)

    # maximum class score and the index of that class
    class_conf, class_idx = torch.max(flat[:, 5:5 + num_classes], 1)
    class_conf = class_conf.to(flat.dtype).unsqueeze(1)
    class_idx = class_idx.to(flat.dtype).unsqueeze(1)

    # batch_idx, x1, y1, x2, y2, objectness_score, class_score, class_idx
    flat = torch.cat([image_idx, flat[:, :5], class_conf, class_idx], 1)

    # NMS must only compare boxes of the same class *within the same image*;
    # grouping by class alone would let a box in one image suppress a box
    # in another image of the batch.
    groups = flat[:, 0] * num_classes + flat[:, 7]
    valid_bbox_indices = batched_nms(
        flat[:, 1:5].clone(), flat[:, 5].clone(), groups, nms_confidence_level
    )

    if len(valid_bbox_indices):
        return flat[valid_bbox_indices, :]
    return None
class ConvLayer(nn.Module):
    """
    A 2-D convolution optionally followed by batch normalisation and a
    LeakyReLU(0.1) activation.

    Parameters:
    -----------
    in_f: number of input channels.
    out_f: number of output channels.
    kernel_size: size of the convolution kernel; padding is kernel_size // 2.
    stride: stride of the convolution.
    use_batch_norm: when True a BatchNorm2d follows the convolution and the
                    convolution has no bias; otherwise the convolution has a
                    bias and no normalisation is applied.
    activation: "leaky" applies LeakyReLU(0.1); any other value applies no
                activation.
    """

    def __init__(self, in_f : int, out_f : int, kernel_size : int = 3,
                 stride : int = 1, use_batch_norm : bool = True,
                 activation : str ="leaky"):
        super().__init__()

        # the convolution only carries a bias when no batch norm follows it
        self.conv = nn.Conv2d(
            in_f,
            out_f,
            kernel_size=kernel_size,
            stride=stride,
            padding=kernel_size // 2,
            bias=not use_batch_norm,
        )

        if use_batch_norm:
            self.batch_norm = nn.BatchNorm2d(out_f)
        else:
            self.batch_norm = noop

        if activation == "leaky":
            self.leaky_relu = nn.LeakyReLU(0.1, inplace=True)
        else:
            self.leaky_relu = noop

    def forward(self, x : torch.Tensor):
        features = self.conv(x)
        features = self.batch_norm(features)
        return self.leaky_relu(features)
self.conv30 = ConvLayer(512, 256, 1, 1) self.conv31 = ConvLayer(256, 512) # self.__shortcut_layer13(self.conv31, shortcut12) self.conv32 = ConvLayer(512, 256, 1, 1) self.conv33 = ConvLayer(256, 512) # self.__shortcut_layer14(self.conv33, shortcut13) self.conv34 = ConvLayer(512, 256, 1, 1) self.conv35 = ConvLayer(256, 512) # self.__shortcut_layer15(self.conv35, shortcut14) self.conv36 = ConvLayer(512, 256, 1, 1) self.conv37 = ConvLayer(256, 512) # self.__shortcut_layer16(self.conv37, shortcut15) self.conv38 = ConvLayer(512, 256, 1, 1) self.conv39 = ConvLayer(256, 512) # self.__shortcut_layer17(self.conv39, shortcut16) self.conv40 = ConvLayer(512, 256, 1, 1) self.conv41 = ConvLayer(256, 512) # self.__shortcut_layer18(self.conv41, shortcut17) self.conv42 = ConvLayer(512, 256, 1, 1) self.conv43 = ConvLayer(256, 512) # self.__shortcut_layer19(self.conv43, shortcut18) self.conv44 = ConvLayer(512, 1024, stride=2) self.conv45 = ConvLayer(1024, 512, 1, 1) self.conv46 = ConvLayer(512, 1024) # self.__shortcut_layer20(self.conv46, self.conv44) self.conv47 = ConvLayer(1024, 512, 1, 1) self.conv48 = ConvLayer(512, 1024) # self.__shortcut_layer21(self.conv48, shortcut20) self.conv49 = ConvLayer(1024, 512, 1, 1) self.conv50 = ConvLayer(512, 1024) # self.__shortcut_layer22(self.conv50, shortcut21) self.conv51 = ConvLayer(1024, 512, 1, 1) self.conv52 = ConvLayer(512, 1024) # self.__shortcut_layer23(self.conv52, shortcut22) self.conv53 = ConvLayer(1024, 512, 1, 1) self.conv54 = ConvLayer(512, 1024) self.conv55 = ConvLayer(1024, 512, 1, 1) self.conv56 = ConvLayer(512, 1024) self.conv57 = ConvLayer(1024, 512, 1, 1) self.conv58 = ConvLayer(512, 1024) self.conv59 = ConvLayer( 1024, (3 * (5 + num_classes)), 1, 1, use_batch_norm=False, activation="linear" ) # yolo layer self.yolo1 = DetectionLayer( num_classes=num_classes, anchors=anchors, anchor_masks=(6, 7, 8), device=device, layer=1 ) # self.__route_layer(self.conv57) self.conv60 = ConvLayer(512, 256, 1, 1) self.upsample1 = nn.Upsample( 
scale_factor=2, mode="nearest" #align_corners=True ) # self.__route_layer(self.upsample1, shortcut19) self.conv61 = ConvLayer(768, 256, 1, 1) self.conv62 = ConvLayer(256, 512) self.conv63 = ConvLayer(512, 256, 1, 1) self.conv64 = ConvLayer(256, 512) self.conv65 = ConvLayer(512, 256, 1, 1) self.conv66 = ConvLayer(256, 512) self.conv67 = ConvLayer( 512, (3 * (5 + num_classes)), 1, 1, use_batch_norm=False, activation="linear" ) # yolo layer self.yolo2 = DetectionLayer( num_classes=num_classes, anchors=anchors, anchor_masks=(3, 4, 5), device=device, layer=2 ) # self.__route_layer(self.conv65) self.conv68 = ConvLayer(256, 128, 1, 1) self.upsample2 = nn.Upsample( scale_factor=2, mode="nearest" #align_corners=True ) # self.__route_layer(self.upsample2, shortcut11) self.conv69 = ConvLayer(384, 128, 1, 1) self.conv70 = ConvLayer(128, 256) self.conv71 = ConvLayer(256, 128, 1, 1) self.conv72 = ConvLayer(128, 256) self.conv73 = ConvLayer(256, 128, 1, 1) self.conv74 = ConvLayer(128, 256) self.conv75 = ConvLayer( 256, (3 * (5 + num_classes)), 1, 1, use_batch_norm=False, activation="linear" ) # yolo layer self.yolo3 = DetectionLayer( num_classes=num_classes, anchors=anchors, anchor_masks=(0, 1, 2), device=device, layer=3 ) def get_loss_layers(self) -> List[torch.Tensor]: return [self.yolo1, self.yolo2, self.yolo3] def __route_layer(self, y1 : torch.Tensor, y2 : Optional[torch.Tensor]=None): if isinstance(y2, torch.Tensor): return torch.cat([y1, y2], 1) return y1 def __shortcut_layer(self, y1 : torch.Tensor, y2 : torch.Tensor, activation : str="linear" ) -> torch.Tensor: actv = noop if activation=="linear" else nn.LeakyReLU(0.1) return actv(y1 + y2) def forward(self, x : torch.Tensor) -> torch.Tensor: y = self.conv2(self.conv1(x)) # shortcut1 y = self.conv5(self.__shortcut_layer(self.conv4(self.conv3(y)), y)) y2 = self.conv7(self.conv6(y)) # shortcut2 y = self.__shortcut_layer(y2, y) y2 = self.conv9(self.conv8(y)) # shortcut3 y2 = self.conv10(self.__shortcut_layer(y2, y)) y = 
self.conv12(self.conv11(y2)) # shortcut4 y2 = self.__shortcut_layer(y, y2) y = self.conv14(self.conv13(y2)) # shortcut5 y2 = self.__shortcut_layer(y, y2) y = self.conv16(self.conv15(self.__shortcut_layer(y2, y))) # shortcut6 y2 = self.__shortcut_layer(y, y2) y = self.conv18(self.conv17(y2)) # shortcut7 y2 = self.__shortcut_layer(y, y2) y = self.conv20(self.conv19(y2)) # shortcut8 y2 = self.__shortcut_layer(y, y2) y = self.conv22(self.conv21(y2)) # shortcut9 y2 = self.__shortcut_layer(y, y2) y = self.conv24(self.conv23(y2)) # shortcut10 y2 = self.__shortcut_layer(y, y2) y = self.conv26(self.conv25(y2)) # shortcut11 r1 = self.__shortcut_layer(y, y2) # route_layer y = self.conv27(r1) y2 = self.conv29(self.conv28(y)) # shortcut12 y = self.__shortcut_layer(y2, y) y2 = self.conv31(self.conv30(y)) # shortcut13 y = self.__shortcut_layer(y2, y) y2 = self.conv33(self.conv32(y)) # shortcut14 y = self.__shortcut_layer(y2, y) y2 = self.conv35(self.conv34(y)) # shortcut15 y = self.__shortcut_layer(y2, y) y2 = self.conv37(self.conv36(y)) # shortcut16 y = self.__shortcut_layer(y2, y) y2 = self.conv39(self.conv38(y)) # shortcut17 y = self.__shortcut_layer(y2, y) y2 = self.conv41(self.conv40(y)) # shortcut18 y = self.__shortcut_layer(y2, y) y2 = self.conv43(self.conv42(y)) # shortcut19 r2 = self.__shortcut_layer(y2, y) # route_layer y2 = self.conv44(r2) y = self.conv46(self.conv45(y2)) # shortcut20 y2 = self.__shortcut_layer(y, y2) y = self.conv48(self.conv47(y2)) # shortcut21 y2 = self.__shortcut_layer(y, y2) y = self.conv50(self.conv49(y2)) # shortcut22 y2 = self.__shortcut_layer(y, y2) y = self.conv52(self.conv51(y2)) # shortcut23 y2 = self.__shortcut_layer(y, y2) y = self.conv54(self.conv53(y2)) r3 = self.conv57(self.conv56(self.conv55(y))) # route_layer y = self.conv59(self.conv58(r3)) # first detection layer out = self.yolo1(y) y = self.conv60(self.__route_layer(r3)) y = self.conv62(self.conv61(self.__route_layer(self.upsample1(y), r2))) r4 = 
self.conv65(self.conv64(self.conv63(y))) # route_layer y = self.conv67(self.conv66(r4)) # second detection layer out = torch.cat([out, self.yolo2(y)], dim=1) y = self.conv68(self.__route_layer(r4)) y = self.conv70(self.conv69(self.__route_layer(self.upsample2(y), r1))) y = self.conv75(self.conv74(self.conv73(self.conv72(self.conv71(y))))) # third detection layer out = torch.cat([out, self.yolo3(y)], dim=1) return out ================================================ FILE: imageai_tf_deprecated/Classification/CUSTOMCLASSIFICATION.md ================================================ # ImageAI : Custom Image Classification A **DeepQuest AI** project https://deepquestai.com

--- ImageAI provides 4 different algorithms and model types to perform custom image prediction using your custom models. You will be able to use your model trained with **ImageAI** and the corresponding model_class JSON file to predict custom objects that you have trained the model on. ### TABLE OF CONTENTS - :white_square_button: Custom Model Prediction - :white_square_button: Custom Model Prediction with Full Model (NEW) - :white_square_button: Custom Prediction with multiple models (NEW) - :white_square_button: Convert custom model to Tensorflow's format (NEW) - :white_square_button: Convert custom model to DeepStack's format (NEW) ### Custom Model Prediction
In this example, we will be using the model trained for 20 experiments on **IdenProf**, a dataset of uniformed professionals and achieved 65.17% accuracy on the test dataset. (You can use your own trained model and generated JSON file. This 'class' is provided mainly for the purpose to use your own custom models.) Download the ResNet model of the model and JSON files in links below: - [**ResNet50**](https://github.com/OlafenwaMoses/ImageAI/releases/download/essentials-v5/idenprof_resnet_ex-056_acc-0.993062.h5) _(Size = 90.4 mb)_ - [**IdenProf model_class.json file**](https://github.com/OlafenwaMoses/ImageAI/releases/download/essentials-v5/idenprof.json) Great! Once you have downloaded this model file and the JSON file, start a new python project, and then copy the model file and the JSON file to your project folder where your python files (.py files) will be. Download the image below, or take any image on your computer that include any of the following professionals(Chef, Doctor, Engineer, Farmer, Fireman, Judge, Mechanic, Pilot, Police and Waiter) and copy it to your python project's folder. Then create a python file and give it a name; an example is **FirstCustomPrediction.py**. 
Then write the code below into the python file: ### FirstCustomPrediction.py ```python from imageai.Classification.Custom import CustomImageClassification import os execution_path = os.getcwd() prediction = CustomImageClassification() prediction.setModelTypeAsResNet50() prediction.setModelPath(os.path.join(execution_path, "idenprof_resnet_ex-056_acc-0.993062.h5")) prediction.setJsonPath(os.path.join(execution_path, "idenprof.json")) prediction.loadModel(num_objects=10) predictions, probabilities = prediction.classifyImage(os.path.join(execution_path, "4.jpg"), result_count=5) for eachPrediction, eachProbability in zip(predictions, probabilities): print(eachPrediction + " : " + eachProbability) ``` **Sample Result:** ![Sample Result](../../data-images/4.jpg) ``` mechanic : 76.82620286941528 chef : 10.106072574853897 waiter : 4.036874696612358 police : 2.6663416996598244 pilot : 2.239348366856575 ``` The code above works as follows: ```python from imageai.Classification.Custom import CustomImageClassification import os ``` The code above imports the **ImageAI** library for custom image prediction and the python **os** class. ```python execution_path = os.getcwd() ``` The above line obtains the path to the folder that contains your python file (in this example, your FirstCustomPrediction.py). 
```python prediction = CustomImageClassification() prediction.setModelTypeAsResNet50() prediction.setModelPath(os.path.join(execution_path, "idenprof_resnet_ex-056_acc-0.993062.h5")) prediction.setJsonPath(os.path.join(execution_path, "idenprof.json")) prediction.loadModel(num_objects=10) ``` In the lines above, we created an instance of the `CustomImageClassification()` class in the first line, then we set the model type of the prediction object to ResNet by calling `.setModelTypeAsResNet50()` in the second line, we set the model path of the prediction object to the path of the custom model file (`idenprof_resnet_ex-056_acc-0.993062.h5`) we copied to the python file folder in the third line, we set the path to the model_class.json of the model in the fourth line, and in the fifth line we load the model and pass the number of objects that can be predicted by the model. ```python predictions, probabilities = prediction.classifyImage(os.path.join(execution_path, "4.jpg"), result_count=5) ``` In the above line, we defined 2 variables to be equal to the function called to predict an image, which is the `.classifyImage()` function, into which we passed the path to our image and also stated the number of prediction results we want to have (values from 1 to 10 in this case) by passing `result_count=5`. The `.classifyImage()` function will return 2 array objects with the first (**predictions**) being an array of predictions and the second (**percentage_probabilities**) being an array of the corresponding percentage probability for each prediction. ```python for eachPrediction, eachProbability in zip(predictions, probabilities): print(eachPrediction + " : " + eachProbability) ``` The above line obtains each object in the **predictions** array, and also obtains the corresponding percentage probability from the **percentage_probabilities**, and finally prints the result of both to console. 
**CustomImageClassification** class also supports the multiple predictions, input types and prediction speeds that are contained in the **ImageClassification** class. Follow this [link](README.md) to see all the details. ### Custom Prediction with multiple models
In previous versions of **ImageAI**, running more than one custom model at once wasn't supported. Now you can run multiple custom models, as many as your computer memory can accommodate. See the example code below for running multiple custom prediction models. ```python from imageai.Classification.Custom import CustomImageClassification import os execution_path = os.getcwd() predictor = CustomImageClassification() predictor.setModelPath(model_path=os.path.join(execution_path, "idenprof_resnet.h5")) predictor.setJsonPath(model_json=os.path.join(execution_path, "idenprof.json")) predictor.setModelTypeAsResNet50() predictor.loadModel(num_objects=10) predictor2 = CustomImageClassification() predictor2.setModelPath(model_path=os.path.join(execution_path, "idenprof_inception_0.719500.h5")) predictor2.setJsonPath(model_json=os.path.join(execution_path, "idenprof.json")) predictor2.setModelTypeAsInceptionV3() predictor2.loadModel(num_objects=10) results, probabilities = predictor.classifyImage(image_input=os.path.join(execution_path, "9.jpg"), result_count=5) print(results) print(probabilities) results2, probabilities2 = predictor2.classifyImage(image_input=os.path.join(execution_path, "9.jpg"), result_count=5) print(results2) print(probabilities2) print("-------------------------------") ``` ### Documentation We have provided full documentation for all **ImageAI** classes and functions in 3 major languages. 
Find links below: * Documentation - **English Version [https://imageai.readthedocs.io](https://imageai.readthedocs.io)** * Documentation - **Chinese Version [https://imageai-cn.readthedocs.io](https://imageai-cn.readthedocs.io)** * Documentation - **French Version [https://imageai-fr.readthedocs.io](https://imageai-fr.readthedocs.io)** ================================================ FILE: imageai_tf_deprecated/Classification/CUSTOMTRAINING.md ================================================ # ImageAI : Custom Prediction Model Training --- **ImageAI** provides the most simple and powerful approach to training custom image prediction models using state-of-the-art SqueezeNet, ResNet50, InceptionV3 and DenseNet which you can load into the `imageai.Classification.Custom.CustomImageClassification` class. This allows you to train your own model on any set of images that corresponds to any type of objects/persons. The training process generates a JSON file that maps the object types in your image dataset and creates lots of models. You will then pick the model with the highest accuracy and perform custom image prediction using the model and the JSON file generated. ### TABLE OF CONTENTS - :white_square_button: Custom Model Training Prediction - :white_square_button: Saving Full Custom Model - :white_square_button: Training on the IdenProf Dataset - :white_square_button: Continuous Model Training - :white_square_button: Transfer Learning (Training from a pre-trained model) ### Custom Model Training
Because model training is a compute-intensive task, we strongly advise you perform this experiment using a computer with an NVIDIA GPU and the GPU version of Tensorflow installed. Performing model training on CPU may take hours or days. With an NVIDIA GPU powered computer system, this will take a few hours. You can use Google Colab for this experiment as it has an NVIDIA K80 GPU available. To train a custom prediction model, you need to prepare the images you want to use to train the model. You will prepare the images as follows: 1. Create a dataset folder with the name you would like your dataset to be called (e.g. pets) 2. In the dataset folder, create a folder by the name **train** 3. In the dataset folder, create a folder by the name **test** 4. In the train folder, create a folder for each object you want the model to predict and give the folder a name that corresponds to the respective object name (e.g. dog, cat, squirrel, snake) 5. In the test folder, create a folder for each object you want the model to predict and give the folder a name that corresponds to the respective object name (e.g. dog, cat, squirrel, snake) 6. In each folder present in the train folder, put the images of each object in its respective folder. These images are the ones to be used to train the model. To produce a model that can perform well in practical applications, I recommend about 500 or more images per object; 1000 images per object is even better. 7. In each folder present in the test folder, put about 100 to 200 images of each object in its respective folder. These images are the ones to be used to test the model as it trains. 8. 
Once you have done this, the structure of your image dataset folder should look like below: ``` pets//train//dog//dog-train-images pets//train//cat//cat-train-images pets//train//squirrel//squirrel-train-images pets//train//snake//snake-train-images pets//test//dog//dog-test-images pets//test//cat//cat-test-images pets//test//squirrel//squirrel-test-images pets//test//snake//snake-test-images ``` 9. Then your training code goes as follows: ```python from imageai.Classification.Custom import ClassificationModelTrainer model_trainer = ClassificationModelTrainer() model_trainer.setModelTypeAsResNet50() model_trainer.setDataDirectory("pets") model_trainer.trainModel(num_objects=4, num_experiments=100, enhance_data=True, batch_size=32, show_network_summary=True) ``` Yes! Just 5 lines of code and you can train any of the available 4 state-of-the-art Deep Learning algorithms on your custom dataset. Now lets take a look at how the code above works. ```python from imageai.Classification.Custom import ClassificationModelTrainer model_trainer = ClassificationModelTrainer() model_trainer.setModelTypeAsResNet50() model_trainer.setDataDirectory("pets") ``` In the first line, we import the **ImageAI** model training class, then we define the model trainer in the second line, we set the network type in the third line and set the path to the image dataset we want to train the network on. ```python model_trainer.trainModel(num_objects=4, num_experiments=100, enhance_data=True, batch_size=32, show_network_summary=True) ``` In the code above, we start the training process. 
The parameters stated in the function are as below: - **num_objects** : this is to state the number of object types in the image dataset - **num_experiments** : this is to state the number of times the network will train over all the training images, which is also called epochs - **enhance_data (optional)** : This is used to state if we want the network to produce modified copies of the training images for better performance. - **batch_size** : This is to state the number of images the network will process at ones. The images are processed in batches until they are exhausted per each experiment performed. - **show_network_summary** : This is to state if the network should show the structure of the training network in the console. When you start the training, you should see something like this in the console: ``` Total params: 23,608,202 Trainable params: 23,555,082 Non-trainable params: 53,120 ____________________________________________________________________________________________________ Using Enhanced Data Generation Found 4000 images belonging to 4 classes. Found 800 images belonging to 4 classes. JSON Mapping for the model classes saved to C:\Users\User\PycharmProjects\ImageAITest\pets\json\model_class.json Number of experiments (Epochs) : 100 ``` When the training progress progresses, you will see results as follows in the console: ``` Epoch 1/100 1/25 [>.............................] - ETA: 52s - loss: 2.3026 - acc: 0.2500 2/25 [=>............................] - ETA: 41s - loss: 2.3027 - acc: 0.1250 3/25 [==>...........................] - ETA: 37s - loss: 2.2961 - acc: 0.1667 4/25 [===>..........................] - ETA: 36s - loss: 2.2980 - acc: 0.1250 5/25 [=====>........................] - ETA: 33s - loss: 2.3178 - acc: 0.1000 6/25 [======>.......................] - ETA: 31s - loss: 2.3214 - acc: 0.0833 7/25 [=======>......................] - ETA: 30s - loss: 2.3202 - acc: 0.0714 8/25 [========>.....................] 
- ETA: 29s - loss: 2.3207 - acc: 0.0625 9/25 [=========>....................] - ETA: 27s - loss: 2.3191 - acc: 0.0556 10/25 [===========>..................] - ETA: 25s - loss: 2.3167 - acc: 0.0750 11/25 [============>.................] - ETA: 23s - loss: 2.3162 - acc: 0.0682 12/25 [=============>................] - ETA: 21s - loss: 2.3143 - acc: 0.0833 13/25 [==============>...............] - ETA: 20s - loss: 2.3135 - acc: 0.0769 14/25 [===============>..............] - ETA: 18s - loss: 2.3132 - acc: 0.0714 15/25 [=================>............] - ETA: 16s - loss: 2.3128 - acc: 0.0667 16/25 [==================>...........] - ETA: 15s - loss: 2.3121 - acc: 0.0781 17/25 [===================>..........] - ETA: 13s - loss: 2.3116 - acc: 0.0735 18/25 [====================>.........] - ETA: 12s - loss: 2.3114 - acc: 0.0694 19/25 [=====================>........] - ETA: 10s - loss: 2.3112 - acc: 0.0658 20/25 [=======================>......] - ETA: 8s - loss: 2.3109 - acc: 0.0625 21/25 [========================>.....] - ETA: 7s - loss: 2.3107 - acc: 0.0595 22/25 [=========================>....] - ETA: 5s - loss: 2.3104 - acc: 0.0568 23/25 [==========================>...] - ETA: 3s - loss: 2.3101 - acc: 0.0543 24/25 [===========================>..] - ETA: 1s - loss: 2.3097 - acc: 0.0625Epoch 00000: saving model to C:\Users\Moses\Documents\Moses\W7\AI\Custom Datasets\IDENPROF\idenprof-small-test\idenprof\models\model_ex-000_acc-0.100000.h5 25/25 [==============================] - 51s - loss: 2.3095 - acc: 0.0600 - val_loss: 2.3026 - val_acc: 0.1000 ``` Let us explain the details shown above: 1. The line **Epoch 1/100** means the network is training the first experiment of the targeted 100 2. The line `1/25 [>.............................] - ETA: 52s - loss: 2.3026 - acc: 0.2500` represents the number of batches that has been trained in the present experiment 3. 
The line `Epoch 00000: saving model to C:\Users\User\PycharmProjects\ImageAITest\pets\models\model_ex-000_acc-0.100000.h5` refers to the model saved after the present experiment. The **ex-000** represents the experiment at this stage while the **acc-0.100000** and **val_acc: 0.1000** represent the accuracy of the model on the test images after the present experiment (maximum value of accuracy is 1.0). This result helps to know the best performing model you can use for custom image prediction. Once you are done training your custom model, you can use the "CustomImageClassification" class to perform image prediction with your model. Simply follow the link below. [imageai/Classification/CUSTOMCLASSIFICATION.md](https://github.com/OlafenwaMoses/ImageAI/blob/master/imageai/Classification/CUSTOMCLASSIFICATION.md) ### Training on the IdenProf data A sample from the IdenProf Dataset used to train a Model for predicting professionals. ![](../../data-images/idenprof.jpg) Below we provide a sample code to train on **IdenProf**, a dataset which contains images of 10 uniformed professionals. 
The code below will download the dataset and initiate the training: ```python from io import open import requests import shutil from zipfile import ZipFile import os from imageai.Classification.Custom import ClassificationModelTrainer execution_path = os.getcwd() TRAIN_ZIP_ONE = os.path.join(execution_path, "idenprof-train1.zip") TRAIN_ZIP_TWO = os.path.join(execution_path, "idenprof-train2.zip") TEST_ZIP = os.path.join(execution_path, "idenprof-test.zip") DATASET_DIR = os.path.join(execution_path, "idenprof") DATASET_TRAIN_DIR = os.path.join(DATASET_DIR, "train") DATASET_TEST_DIR = os.path.join(DATASET_DIR, "test") if(os.path.exists(DATASET_DIR) == False): os.mkdir(DATASET_DIR) if(os.path.exists(DATASET_TRAIN_DIR) == False): os.mkdir(DATASET_TRAIN_DIR) if(os.path.exists(DATASET_TEST_DIR) == False): os.mkdir(DATASET_TEST_DIR) if(len(os.listdir(DATASET_TRAIN_DIR)) < 10): if(os.path.exists(TRAIN_ZIP_ONE) == False): print("Downloading idenprof-train1.zip") data = requests.get("https://github.com/OlafenwaMoses/IdenProf/releases/download/v1.0/idenprof-train1.zip", stream = True) with open(TRAIN_ZIP_ONE, "wb") as file: shutil.copyfileobj(data.raw, file) del data if (os.path.exists(TRAIN_ZIP_TWO) == False): print("Downloading idenprof-train2.zip") data = requests.get("https://github.com/OlafenwaMoses/IdenProf/releases/download/v1.0/idenprof-train2.zip", stream=True) with open(TRAIN_ZIP_TWO, "wb") as file: shutil.copyfileobj(data.raw, file) del data print("Extracting idenprof-train1.zip") extract1 = ZipFile(TRAIN_ZIP_ONE) extract1.extractall(DATASET_TRAIN_DIR) extract1.close() print("Extracting idenprof-train2.zip") extract2 = ZipFile(TRAIN_ZIP_TWO) extract2.extractall(DATASET_TRAIN_DIR) extract2.close() if(len(os.listdir(DATASET_TEST_DIR)) < 10): if (os.path.exists(TEST_ZIP) == False): print("Downloading idenprof-test.zip") data = requests.get("https://github.com/OlafenwaMoses/IdenProf/releases/download/v1.0/idenprof-test.zip", stream=True) with open(TEST_ZIP, "wb") as 
file: shutil.copyfileobj(data.raw, file) del data print("Extracting idenprof-test.zip") extract = ZipFile(TEST_ZIP) extract.extractall(DATASET_TEST_DIR) extract.close() model_trainer = ClassificationModelTrainer() model_trainer.setModelTypeAsResNet50() model_trainer.setDataDirectory(DATASET_DIR) model_trainer.trainModel(num_objects=10, num_experiments=100, enhance_data=True, batch_size=32, show_network_summary=True) ``` ### Continuous Model Training
**ImageAI** now allows you to continue training your custom model on your previously saved model. This is useful in cases of incomplete training due to compute time limits or a large dataset, or should you intend to further train your model. Kindly note that **continuous training** is for using a previously saved model to train on the same dataset the model was trained on. All you need to do is set the `continue_from_model` parameter to the path of the previously saved model in your `trainModel()` function. See an example code below. ```python from imageai.Classification.Custom import ClassificationModelTrainer import os trainer = ClassificationModelTrainer() trainer.setModelTypeAsDenseNet121() trainer.setDataDirectory("idenprof") trainer.trainModel(num_objects=10, num_experiments=50, enhance_data=True, batch_size=8, show_network_summary=True, continue_from_model="idenprof_densenet-0.763500.h5") ``` ### Transfer Learning (Training from a pre-trained model)
From the feedback we have received over the past months, we discovered most custom models trained with **ImageAI** were based on datasets with a small number of images, as they fall short of the minimum recommendation of 500 images per class of objects for achieving viable accuracy. To ensure they can still train very accurate custom models using a small number of images, **ImageAI** now allows you to train by leveraging **transfer learning**. This means you can take any pre-trained **ResNet50**, **MobileNetV2**, **InceptionV3** and **DenseNet121** model trained on larger datasets and use it to kickstart your custom model training. All you need to do is set the `transfer_from_model` parameter to the path of the pre-trained model and the `initial_num_objects` parameter, which corresponds to the number of objects in the previous dataset the pre-trained model was trained on, all in your `trainModel()` function. See an example code below, showing how to perform transfer learning from a ResNet50 model trained on the ImageNet dataset. ```python from imageai.Classification.Custom import ClassificationModelTrainer import os trainer = ClassificationModelTrainer() trainer.setModelTypeAsResNet50() trainer.setDataDirectory("idenprof") trainer.trainModel(num_objects=10, num_experiments=50, enhance_data=True, batch_size=32, show_network_summary=True,transfer_from_model="resnet50_imagenet_tf.2.0.h5", initial_num_objects=1000) ``` ### Contact Developer - **Moses Olafenwa** * _Email:_ guymodscientist@gmail.com * _Website:_ [https://moses.aicommons.science](https://moses.aicommons.science) * _Twitter:_ [@OlafenwaMoses](https://twitter.com/OlafenwaMoses) * _Medium:_ [@guymodscientist](https://medium.com/@guymodscientist) * _Facebook:_ [moses.olafenwa](https://facebook.com/moses.olafenwa) ### Documentation We have provided full documentation for all **ImageAI** classes and functions in 3 major languages. 
class ClassificationModelTrainer:
    """
    This is the Classification Model training class, that allows you to define a deep learning
    network from the 4 available network types supported by ImageAI which are MobileNetV2,
    ResNet50, InceptionV3 and DenseNet121.

    Typical usage: pick a network with one of the 'setModelTypeAs...()' functions, point the
    trainer at a dataset with 'setDataDirectory()', then call 'trainModel()'.
    """

    # Values that 'setModelTypeAs...()' can store and that 'trainModel()' knows how to build.
    _SUPPORTED_MODEL_TYPES = ("mobilenetv2", "resnet50", "inceptionv3", "densenet121")

    def __init__(self):
        self.__modelType = ""
        self.__use_pretrained_model = False
        self.__data_dir = ""
        self.__train_dir = ""
        self.__test_dir = ""
        self.__logs_dir = ""
        self.__num_epochs = 10
        self.__trained_model_dir = ""
        self.__model_class_dir = ""
        self.__initial_learning_rate = 1e-3
        self.__model_collection = []

    def setModelTypeAsSqueezeNet(self):
        """
        SqueezeNet is no longer supported; calling this function always raises a ValueError
        that points to 'setModelTypeAsMobileNetV2()'.
        """
        raise ValueError("ImageAI no longer support SqueezeNet. You can use MobileNetV2 instead by downloading the MobileNetV2 model and call the function 'setModelTypeAsMobileNetV2'")

    def setModelTypeAsMobileNetV2(self):
        """
        'setModelTypeAsMobileNetV2()' is used to set the model type to the MobileNetV2 model
        for the training instance object .
        :return:
        """
        self.__modelType = "mobilenetv2"

    def setModelTypeAsResNet(self):
        """
        Deprecated alias of 'setModelTypeAsResNet50()'. Emits a DeprecationWarning and delegates.
        :return:
        """
        # The stdlib warning replaces the 'matplotlib.cbook.deprecated' decorator so that this
        # class does not depend on a helper that newer Matplotlib releases may not expose.
        warnings.warn(
            "'.setModelTypeAsResNet()' has been deprecated! Please use 'setModelTypeAsResNet50()' instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        return self.setModelTypeAsResNet50()

    def setModelTypeAsResNet50(self):
        """
        'setModelTypeAsResNet50()' is used to set the model type to the ResNet50 model
        for the training instance object .
        :return:
        """
        self.__modelType = "resnet50"

    def setModelTypeAsDenseNet(self):
        """
        Deprecated alias of 'setModelTypeAsDenseNet121()'. Emits a DeprecationWarning and delegates.
        :return:
        """
        warnings.warn(
            "'.setModelTypeAsDenseNet()' has been deprecated! Please use 'setModelTypeAsDenseNet121()' instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        return self.setModelTypeAsDenseNet121()

    def setModelTypeAsDenseNet121(self):
        """
        'setModelTypeAsDenseNet121()' is used to set the model type to the DenseNet121 model
        for the training instance object .
        :return:
        """
        self.__modelType = "densenet121"

    def setModelTypeAsInceptionV3(self):
        """
        'setModelTypeAsInceptionV3()' is used to set the model type to the InceptionV3 model
        for the training instance object .
        :return:
        """
        self.__modelType = "inceptionv3"

    def setDataDirectory(self, data_directory="", train_subdirectory="train", test_subdirectory="test",
                         models_subdirectory="models", json_subdirectory="json"):
        """
        'setDataDirectory()'

        - data_directory , is required to set the path to which the data/dataset to be used for
                 training is kept. The directory can have any name, but it must have 'train' and 'test'
                 sub-directory. In the 'train' and 'test' sub-directories, there must be sub-directories
                 with each having it's name corresponds to the name/label of the object whose images are
                 to be kept. The structure of the 'test' and 'train' folder must be as follows:

                >> train >> class1 >> class1_train_images
                         >> class2 >> class2_train_images
                         >> class3 >> class3_train_images
                         >> class4 >> class4_train_images
                         >> class5 >> class5_train_images

                >> test >> class1 >> class1_test_images
                        >> class2 >> class2_test_images
                        >> class3 >> class3_test_images
                        >> class4 >> class4_test_images
                        >> class5 >> class5_test_images

        - train_subdirectory (optional), subdirectory within 'data_directory' where the training set is. Defaults to 'train'.
        - test_subdirectory (optional), subdirectory within 'data_directory' where the testing set is. Defaults to 'test'.
        - models_subdirectory (optional), subdirectory within 'data_directory' where the output models will be saved. Defaults to 'models'.
        - json_subdirectory (optional), subdirectory within 'data_directory' where the model classes json file will be saved. Defaults to 'json'.

        The logs directory is always '<data_directory>/logs'.

        :param data_directory:
        :param train_subdirectory:
        :param test_subdirectory:
        :param models_subdirectory:
        :param json_subdirectory:
        :return:
        """
        self.__data_dir = data_directory

        self.__train_dir = os.path.join(self.__data_dir, train_subdirectory)
        self.__test_dir = os.path.join(self.__data_dir, test_subdirectory)
        self.__trained_model_dir = os.path.join(self.__data_dir, models_subdirectory)
        self.__model_class_dir = os.path.join(self.__data_dir, json_subdirectory)
        self.__logs_dir = os.path.join(self.__data_dir, "logs")

    def lr_schedule(self, epoch):
        """
        Learning rate schedule used by the Keras LearningRateScheduler callback.

        The initial learning rate is multiplied by 1e-1, 1e-2, 1e-3 and 1e-4 once 'epoch' is
        greater than 40%, 60%, 80% and 90% of the total number of epochs respectively.

        :param epoch: index of the epoch about to run
        :return: learning rate to use for that epoch
        """
        lr = self.__initial_learning_rate
        total_epochs = self.__num_epochs

        check_1 = int(total_epochs * 0.9)
        check_2 = int(total_epochs * 0.8)
        check_3 = int(total_epochs * 0.6)
        check_4 = int(total_epochs * 0.4)

        # Checked from the latest threshold backwards so the strongest decay wins.
        if epoch > check_1:
            lr *= 1e-4
        elif epoch > check_2:
            lr *= 1e-3
        elif epoch > check_3:
            lr *= 1e-2
        elif epoch > check_4:
            lr *= 1e-1

        return lr

    def __build_network(self, num_objects, training_image_size, continue_from_model,
                        transfer_from_model, show_network_summary):
        """
        Build the Keras model for the selected model type.

        One code path serves all four architectures; only the 'tf.keras.applications'
        constructor differs between them.
        """
        constructors = {
            "mobilenetv2": tf.keras.applications.MobileNetV2,
            "resnet50": tf.keras.applications.ResNet50,
            "inceptionv3": tf.keras.applications.InceptionV3,
            "densenet121": tf.keras.applications.DenseNet121,
        }
        build = constructors[self.__modelType]
        input_shape = (training_image_size, training_image_size, 3)

        if continue_from_model is not None:
            # Continue training: full network (with classification head) restored from the file.
            model = build(input_shape=input_shape, weights=continue_from_model,
                          classes=num_objects, include_top=True)
            if show_network_summary:
                print("Training using weights from a previouly model")
            return model

        if transfer_from_model is not None:
            # Transfer learning: headless backbone restored from the file, new head added below.
            base_model = build(input_shape=input_shape, weights=transfer_from_model,
                               include_top=False, pooling="avg")
        else:
            # Training from scratch: randomly initialised headless backbone.
            base_model = build(input_shape=input_shape, weights=None, classes=num_objects,
                               include_top=False, pooling="avg")

        network = tf.keras.layers.Dense(num_objects, activation='softmax', use_bias=True)(base_model.output)
        model = tf.keras.models.Model(inputs=base_model.input, outputs=network)

        if transfer_from_model is not None and show_network_summary:
            print("Training using weights from a pre-trained ImageNet model")
        return model

    def trainModel(self, num_objects, num_experiments=200, enhance_data=False, batch_size = 32, initial_learning_rate=1e-3, show_network_summary=False, training_image_size = 224, continue_from_model=None, transfer_from_model=None, transfer_with_full_training=True, initial_num_objects = None, save_full_model = False):
        """
        'trainModel()' function starts the model actual training. It accepts the following values:
        - num_objects , which is the number of classes present in the dataset that is to be used for training
        - num_experiments , also known as epochs, it is the number of times the network will train on all the training dataset
        - enhance_data (optional) , this is used to modify the dataset and create more instance of the training set to enhance the training result
        - batch_size (optional) , due to memory constraints, the network trains on a batch at once, until all the training set is exhausted. The value is set to 32 by default, but can be increased or decreased depending on the memory of the compute used for training. The batch_size is conventionally set to 16, 32, 64, 128.
        - initial_learning_rate(optional) , this value is used to adjust the weights generated in the network. You are advised to keep this value as it is if you don't have deep understanding of this concept.
        - show_network_summary(optional) , this value is used to show the structure of the network should you desire to see it. It is set to False by default
        - training_image_size(optional) , this value is used to define the image size on which the model will be trained. The value is 224 by default and is kept at a minimum of 100.
        - continue_from_model (optional) , this is used to set the path to a model file trained on the same dataset. It is primarily for continuous training from a previously saved model.
        - transfer_from_model (optional) , this is used to set the path to a model file trained on another dataset. It is primarily used to perform transfer learning.
        - transfer_with_full_training (optional) , this is used to set the pre-trained model to be re-trained across all the layers or only at the top layers. (Accepted for compatibility; this function does not currently read it.)
        - initial_num_objects (required if 'transfer_from_model' is set ), this is used to set the number of objects the model used for transfer learning is trained on. (Accepted for compatibility; this function does not currently read it.)
        - save_full_model ( optional ), this is used to save the trained models with their network types. Any model saved by this specification can be loaded without specifying the network type.

        Raises a ValueError if no supported model type has been set.

        :param num_objects:
        :param num_experiments:
        :param enhance_data:
        :param batch_size:
        :param initial_learning_rate:
        :param show_network_summary:
        :param training_image_size:
        :param continue_from_model:
        :param transfer_from_model:
        :param initial_num_objects:
        :param save_full_model:
        :return:
        """
        # Fail early with a clear message instead of an unbound 'model' later on.
        if self.__modelType not in self._SUPPORTED_MODEL_TYPES:
            raise ValueError("You must set a valid model type before training the model.")

        self.__num_epochs = num_experiments
        self.__initial_learning_rate = initial_learning_rate
        lr_scheduler = tf.keras.callbacks.LearningRateScheduler(self.lr_schedule)

        if training_image_size < 100:
            warnings.warn("The specified training_image_size {} is less than 100. Hence the training_image_size will default to 100.".format(training_image_size))
            training_image_size = 100

        model = self.__build_network(num_objects, training_image_size, continue_from_model,
                                     transfer_from_model, show_network_summary)

        optimizer = tf.keras.optimizers.Adam(lr=self.__initial_learning_rate, decay=1e-4)
        model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
        if show_network_summary:
            model.summary()

        # '{epoch}' and '{accuracy}' are filled in by the ModelCheckpoint callback.
        model_name = 'model_ex-{epoch:03d}_acc-{accuracy:03f}.h5'
        log_name = '{}_lr-{}_{}'.format(self.__modelType, initial_learning_rate, time.strftime("%Y-%m-%d-%H-%M-%S"))

        model_path = os.path.join(self.__trained_model_dir, model_name)
        logs_path = os.path.join(self.__logs_dir, log_name)
        for directory in (self.__trained_model_dir, self.__model_class_dir, self.__logs_dir, logs_path):
            os.makedirs(directory, exist_ok=True)

        checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath=model_path,
                                                        monitor='accuracy',
                                                        verbose=1,
                                                        save_weights_only=not save_full_model,
                                                        save_best_only=True,
                                                        period=1)

        # NOTE(review): this callback is built but, as in the original code, is not passed to
        # 'fit_generator' below -- confirm whether TensorBoard logging is meant to be enabled.
        tensorboard = tf.keras.callbacks.TensorBoard(log_dir=logs_path,
                                                     histogram_freq=0,
                                                     write_graph=False,
                                                     write_images=False)

        if enhance_data:
            print("Using Enhanced Data Generation")

        # Random shifts are only applied when data enhancement is requested.
        shift_range = 0.1 if enhance_data else 0

        train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
            rescale=1. / 255,
            horizontal_flip=enhance_data,
            height_shift_range=shift_range,
            width_shift_range=shift_range)
        test_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
            rescale=1. / 255)

        train_generator = train_datagen.flow_from_directory(self.__train_dir,
                                                            target_size=(training_image_size, training_image_size),
                                                            batch_size=batch_size,
                                                            class_mode="categorical")
        test_generator = test_datagen.flow_from_directory(self.__test_dir,
                                                          target_size=(training_image_size, training_image_size),
                                                          batch_size=batch_size,
                                                          class_mode="categorical")

        # Persist the index -> class-name mapping next to the models.
        class_json = {str(index): name for name, index in train_generator.class_indices.items()}
        class_json_path = os.path.join(self.__model_class_dir, "model_class.json")
        with open(class_json_path, "w+") as json_file:
            json.dump(class_json, json_file, indent=4, separators=(",", " : "),
                      ensure_ascii=True)
        print("JSON Mapping for the model classes saved to ", class_json_path)

        num_train = len(train_generator.filenames)
        num_test = len(test_generator.filenames)
        print("Number of experiments (Epochs) : ", self.__num_epochs)

        model.fit_generator(train_generator,
                            steps_per_epoch=int(num_train / batch_size),
                            epochs=self.__num_epochs,
                            validation_data=test_generator,
                            validation_steps=int(num_test / batch_size),
                            callbacks=[checkpoint, lr_scheduler])
The following functions are required to be called before a classification can be made * setModelPath() , path to your custom model * setJsonPath , , path to your custom model's corresponding JSON file * At least of of the following and it must correspond to the model set in the setModelPath() [setModelTypeAsMobileNetV2(), setModelTypeAsResNet50(), setModelTypeAsDenseNet121, setModelTypeAsInceptionV3] * loadModel() [This must be called once only before making a classification] Once the above functions have been called, you can call the classifyImage() function of the classification instance object at anytime to predict an image. """ def __init__(self): self.__modelType = "" self.modelPath = "" self.jsonPath = "" self.numObjects = 10 self.__model_classes = dict() self.__modelLoaded = False self.__model_collection = [] self.__input_image_size = 224 def setModelPath(self, model_path): """ 'setModelPath()' function is required and is used to set the file path to the model adopted from the list of the available 4 model types. The model path must correspond to the model type set for the classification instance object. :param model_path: :return: """ self.modelPath = model_path def setJsonPath(self, model_json): """ 'setJsonPath()' :param model_path: :return: """ self.jsonPath = model_json def setModelTypeAsMobileNetV2(self): """ 'setModelTypeAsMobileNetV2()' is used to set the model type to the MobileNetV2 model for the classification instance object . :return: """ self.__modelType = "mobilenetv2" def setModelTypeAsResNet50(self): """ 'setModelTypeAsResNet50()' is used to set the model type to the ResNet50 model for the classification instance object . :return: """ self.__modelType = "resnet50" def setModelTypeAsDenseNet121(self): """ 'setModelTypeAsDenseNet121()' is used to set the model type to the DenseNet121 model for the classification instance object . 
def setModelTypeAsInceptionV3(self):
    """
    'setModelTypeAsInceptionV3()' is used to set the model type to the InceptionV3 model
    for the classification instance object.

    :return:
    """
    self.__modelType = "inceptionv3"


def loadModel(self, classification_speed="normal", num_objects=10):
    """
    'loadModel()' reads the class mapping from the JSON file at 'self.jsonPath' and, the first
    time it is called, builds the network selected through one of the 'setModelTypeAs...()'
    functions using the weights file at 'self.modelPath'.

    'classification_speed' selects the square input size fed to the network:
    "normal" = 224, "fast" = 160, "faster" = 120, "fastest" = 100. Any other value leaves the
    currently configured input size untouched.

    If a model has already been loaded, only the class mapping and the input size are refreshed.

    :param classification_speed: one of "normal", "fast", "faster", "fastest"
    :param num_objects: number of classes the network outputs
    :raises ValueError: if no model type has been set, or if the network cannot be built /
        its weights cannot be loaded from 'self.modelPath'
    :return:
    """
    # Context manager so the JSON file handle is closed instead of leaked.
    with open(self.jsonPath) as class_map_file:
        self.__model_classes = json.load(class_map_file)

    input_size = {"normal": 224, "fast": 160, "faster": 120, "fastest": 100}.get(classification_speed)
    if input_size is not None:
        self.__input_image_size = input_size

    if self.__modelLoaded:
        return

    if self.__modelType == "":
        raise ValueError("You must set a valid model type before loading the model.")

    keras_apps = tf.keras.applications
    # model type -> (network constructor, message raised when building/loading fails)
    builders = {
        "mobilenetv2": (
            keras_apps.MobileNetV2,
            "An error occured. Ensure your model file is a MobileNetV2 Model and is located in the path {}",
        ),
        "resnet50": (
            keras_apps.ResNet50,
            "An error occured. Ensure your model file is a ResNet50 Model and is located in the path {}",
        ),
        "densenet121": (
            keras_apps.DenseNet121,
            "An error occured. Ensure your model file is a DenseNet121 Model and is located in the path {}",
        ),
        "inceptionv3": (
            keras_apps.InceptionV3,
            "An error occured. Ensure your model file is in {}",
        ),
    }
    if self.__modelType not in builders:
        # Any other model type was a silent no-op before; keep it that way.
        return

    build_model, failure_message = builders[self.__modelType]
    input_shape = (self.__input_image_size, self.__input_image_size, 3)
    try:
        if self.__modelType == "resnet50":
            # ResNet50 is built without weights and then loaded explicitly.
            model = build_model(input_shape=input_shape, weights=None, classes=num_objects)
            model.load_weights(self.modelPath)
        else:
            model = build_model(input_shape=input_shape, weights=self.modelPath, classes=num_objects)
    except Exception as err:
        # MobileNetV2 used to be built outside its try block, so its failures escaped
        # unwrapped; every model type now reports a ValueError with the cause chained.
        raise ValueError(failure_message.format(self.modelPath)) from err

    self.__model_collection.append(model)
    self.__modelLoaded = True


def loadFullModel(self, classification_speed="normal", num_objects=10):
    """
    'loadFullModel()' loads a complete saved Keras model (architecture and weights) from the
    path in 'self.modelPath' via 'tf.keras.models.load_model', so no model type has to be set
    beforehand. It also reads the class mapping from the JSON file at 'self.jsonPath'.

    'classification_speed' selects the square input size used when images are prepared:
    "normal" = 224, "fast" = 160, "faster" = 120, "fastest" = 100. Any other value leaves the
    currently configured input size untouched.

    :param classification_speed: one of "normal", "fast", "faster", "fastest"
    :param num_objects: the number of objects the model is trained to recognize
        (stored on the instance as 'numObjects')
    :return:
    """
    self.numObjects = num_objects

    # Context manager so the JSON file handle is closed instead of leaked.
    with open(self.jsonPath) as class_map_file:
        self.__model_classes = json.load(class_map_file)

    input_size = {"normal": 224, "fast": 160, "faster": 120, "fastest": 100}.get(classification_speed)
    if input_size is not None:
        self.__input_image_size = input_size

    if not self.__modelLoaded:
        model = tf.keras.models.load_model(filepath=self.modelPath)
        self.__model_collection.append(model)
        self.__modelLoaded = True
        self.__modelType = "full"


def getModels(self):
    """
    'getModels()' provides access to the internal model collection. Helpful if models
    are used down the line with tools like lime.

    :return: the list of loaded models
    """
    return self.__model_collection


def classifyImage(self, image_input, result_count=5, input_type="file"):
    """
    'classifyImage()' classifies a single image and returns two parallel lists.

    * input_type (optional) , the type of input passed in. Acceptable values are "file",
      "array" and "stream"
    * image_input , file path / numpy array / image file stream of the image.
    * result_count (optional) , the number of top classifications to return. The default is 5.

    'classification_results' holds the class names ordered by descending probability and
    'classification_probabilities' holds the matching percentage probability at the same
    position.

    :param image_input: file path, numpy array or file stream, depending on 'input_type'
    :param result_count: number of top classes to return
    :param input_type: "file", "array" or "stream"
    :raises ValueError: if no model is loaded, 'input_type' is not supported, the image
        cannot be read, or the prediction fails
    :return classification_results, classification_probabilities:
    """
    classification_results = []
    classification_probabilities = []

    if not self.__modelLoaded:
        raise ValueError("You must call the loadModel() function before making classification.")

    target_size = (self.__input_image_size, self.__input_image_size)

    if input_type == "file":
        try:
            loaded_image = tf.keras.preprocessing.image.load_img(image_input, target_size=target_size)
            image_to_predict = tf.keras.preprocessing.image.img_to_array(loaded_image, data_format="channels_last")
            image_to_predict = np.expand_dims(image_to_predict, axis=0)
        except Exception as err:
            raise ValueError("You have set a path to an invalid image file.") from err
    elif input_type == "array":
        try:
            # convert("RGB") guarantees 3 channels (grayscale / RGBA inputs), matching
            # what load_img produces for the "file" input type.
            pil_image = Image.fromarray(np.uint8(image_input)).convert("RGB").resize(target_size)
            image_to_predict = np.expand_dims(np.asarray(pil_image, dtype=np.float64), axis=0)
        except Exception as err:
            raise ValueError("You have parsed in a wrong numpy array for the image") from err
    elif input_type == "stream":
        try:
            pil_image = Image.open(image_input).convert("RGB").resize(target_size)
            image_to_predict = np.expand_dims(np.asarray(pil_image, dtype=np.float64), axis=0)
        except Exception as err:
            raise ValueError("You have parsed in a wrong stream for the image") from err
    else:
        # Previously this fell through with 'image_to_predict' unbound and failed later
        # with an unrelated error.
        raise ValueError('Unsupported input_type "{}". Use "file", "array" or "stream".'.format(input_type))

    keras_apps = tf.keras.applications
    # No entry for "resnet50": its input is passed to the network unchanged.
    # NOTE(review): "full" models get MobileNetV2 preprocessing - confirm this matches
    # how models loaded through loadFullModel() were trained.
    preprocessors = {
        "mobilenetv2": keras_apps.mobilenet_v2.preprocess_input,
        "full": keras_apps.mobilenet_v2.preprocess_input,
        "inceptionv3": keras_apps.inception_v3.preprocess_input,
        "densenet121": keras_apps.densenet.preprocess_input,
    }
    preprocess = preprocessors.get(self.__modelType)
    if preprocess is not None:
        image_to_predict = preprocess(image_to_predict)

    try:
        model = self.__model_collection[0]
        prediction = model.predict(image_to_predict, steps=1)
        for scores in prediction:
            # Indices of the 'result_count' highest scores, best first.
            top_indices = scores.argsort()[-result_count:][::-1]
            for class_index in top_indices:
                classification_results.append(str(self.__model_classes[str(class_index)]))
                classification_probabilities.append(scores[class_index] * 100)
    except Exception as err:
        raise ValueError("Error. Ensure your input image is valid") from err

    return classification_results, classification_probabilities


@deprecated(since="2.1.6", message="'.predictImage()' has been deprecated! Please use 'classifyImage()' instead.")
def predictImage(self, image_input, result_count=5, input_type="file"):
    """
    Deprecated alias of 'classifyImage()'; forwards all arguments unchanged.

    :param image_input:
    :param result_count:
    :param input_type:
    :return classification_results, classification_probabilities:
    """
    return self.classifyImage(image_input, result_count, input_type)
To download the model file for your choice of algorithm, click on any of the links below: - **[MobileNetV2](https://github.com/OlafenwaMoses/ImageAI/releases/download/essentials-v5/mobilenet_v2.h5)** _(Size = 4.82 mb, fastest prediction time and moderate accuracy)_ - **[ResNet50](https://github.com/OlafenwaMoses/ImageAI/releases/download/essentials-v5/resnet50_imagenet_tf.2.0.h5)** by Microsoft Research _(Size = 98 mb, fast prediction time and high accuracy)_ - **[InceptionV3](https://github.com/OlafenwaMoses/ImageAI/releases/download/1.0/inception_v3_weights_tf_dim_ordering_tf_kernels.h5)** by Google Brain team _(Size = 91.6 mb, slow prediction time and higher accuracy)_ - **[DenseNet121](https://github.com/OlafenwaMoses/ImageAI/releases/download/1.0/DenseNet-BC-121-32.h5)** by Facebook AI Research _(Size = 31.6 mb, slower prediction time and highest accuracy)_ Great! Once you have downloaded this model file, start a new python project, and then copy the model file to your project folder where your python files (.py files) will be . Download the image below, or take any image on your computer and copy it to your python project's folder. Then create a python file and give it a name; an example is `FirstPrediction.py`. Then write the code below into the python file: ### FirstPrediction.py
```python from imageai.Classification import ImageClassification import os execution_path = os.getcwd() prediction = ImageClassification() prediction.setModelTypeAsResNet50() prediction.setModelPath(os.path.join(execution_path, "resnet50_imagenet_tf.2.0.h5")) prediction.loadModel() predictions, probabilities = prediction.classifyImage(os.path.join(execution_path, "1.jpg"), result_count=5 ) for eachPrediction, eachProbability in zip(predictions, probabilities): print(eachPrediction , " : " , eachProbability) ``` Sample Result: ![](../../data-images/1.jpg) ``` convertible : 52.459555864334106 sports_car : 37.61284649372101 pickup : 3.1751200556755066 car_wheel : 1.817505806684494 minivan : 1.7487050965428352 ``` The code above works as follows: ```python from imageai.Classification import ImageClassification import os ``` The code above imports the `ImageAI` library and the Python `os` module. ```python execution_path = os.getcwd() ``` The above line obtains the path to the folder that contains your python file (in this example, your FirstPrediction.py). ```python prediction = ImageClassification() prediction.setModelTypeAsResNet50() prediction.setModelPath(os.path.join(execution_path, "resnet50_imagenet_tf.2.0.h5")) ``` In the lines above, we created an instance of the `ImageClassification()` class in the first line, then we set the model type of the prediction object to ResNet50 by calling `.setModelTypeAsResNet50()` in the second line, and then we set the model path of the prediction object to the path of the model file (`resnet50_imagenet_tf.2.0.h5`) we copied to the python file folder in the third line.
```python predictions, probabilities = prediction.classifyImage(os.path.join(execution_path, "1.jpg"), result_count=5 ) ``` In the above line, we assigned 2 variables from the result of the function called to predict an image, which is the `.classifyImage()` function, into which we passed the path to our image and also stated the number of prediction results we want to have (values from 1 to 1000) by passing `result_count=5`. The `.classifyImage()` function will return 2 array objects with the first (**predictions**) being an array of predictions and the second (**probabilities**) being an array of the corresponding percentage probability for each prediction. ```python for eachPrediction, eachProbability in zip(predictions, probabilities): print(eachPrediction, " : " , eachProbability) ``` The above lines obtain each object in the **predictions** array, and also obtain the corresponding percentage probability from the **probabilities** array, and finally print both to the console. ### Prediction Speed
**ImageAI** now provides prediction speeds for all image prediction tasks. The prediction speeds allow you to reduce the time of prediction at a rate between 20% - 60%, while still giving accurate prediction results with only slight changes. The available prediction speeds are **"normal"**(default), **"fast"**, **"faster"** and **"fastest"**. All you need to do is to state the speed mode you desire when loading the model as seen below. ```python prediction.loadModel(classification_speed="fast") ``` To observe the differences in the prediction speeds, look below for each speed applied to multiple prediction with time taken to predict and predictions given. The results below are obtained from predictions performed on a Windows 8 laptop with Intel Celeron N2820 CPU, with processor speed of 2.13GHz **Prediction Speed = "normal" , Prediction Time = 5.9 seconds** ``` convertible : 52.459555864334106 sports_car : 37.61284649372101 pickup : 3.1751200556755066 car_wheel : 1.817505806684494 minivan : 1.7487050965428352 ----------------------- toilet_tissue : 13.99008333683014 jeep : 6.842949986457825 car_wheel : 6.71963095664978 seat_belt : 6.704962253570557 minivan : 5.861184373497963 ----------------------- bustard : 52.03368067741394 vulture : 20.936034619808197 crane : 10.620515048503876 kite : 10.20539253950119 white_stork : 1.6472270712256432 ----------------------- ``` **Prediction Speed = "fast" , Prediction Time = 3.4 seconds** ``` sports_car : 55.5136501789093 pickup : 19.860029220581055 convertible : 17.88402795791626 tow_truck : 2.357563190162182 car_wheel : 1.8646160140633583 ----------------------- drum : 12.241223454475403 toilet_tissue : 10.96322312951088 car_wheel : 10.776633024215698 dial_telephone : 9.840480983257294 toilet_seat : 8.989936858415604 ----------------------- vulture : 52.81011462211609 bustard : 45.628002285957336 kite : 0.8065823465585709 goose : 0.3629807382822037 crane : 0.21266008261591196 ----------------------- ``` **Prediction Speed = "faster" 
, Prediction Time = 2.7 seconds** ``` sports_car : 79.90474104881287 tow_truck : 9.751049429178238 convertible : 7.056044787168503 racer : 1.8735893070697784 car_wheel : 0.7379394955933094 ----------------------- oil_filter : 73.52778315544128 jeep : 11.926891654729843 reflex_camera : 7.9965077340602875 Polaroid_camera : 0.9798810817301273 barbell : 0.8661789819598198 ----------------------- vulture : 93.00530552864075 bustard : 6.636220961809158 kite : 0.15161558985710144 bald_eagle : 0.10513027664273977 crane : 0.05982434959150851 ----------------------- ``` **Prediction Speed = "fastest" , Prediction Time = 2.2 seconds** ``` tow_truck : 62.5033438205719 sports_car : 31.26143217086792 racer : 2.2139860317111015 fire_engine : 1.7813067883253098 ambulance : 0.8790366351604462 ----------------------- reflex_camera : 94.00787949562073 racer : 2.345871739089489 jeep : 1.6016140580177307 oil_filter : 1.4121259562671185 lens_cap : 0.1283118617720902 ----------------------- kite : 98.5377550125122 vulture : 0.7469987496733665 bustard : 0.36855682265013456 bald_eagle : 0.2437378279864788 great_grey_owl : 0.0699841941241175 ----------------------- ``` **PLEASE NOTE:** When adjusting speed modes, it is best to use models that have higher accuracies like the DenseNet or InceptionV3 models, or use it in case scenarios where the images predicted are iconic. ### Image Input Types
Previous versions of **ImageAI** supported only file inputs and accepted file paths to an image for image prediction. Now, **ImageAI** supports 3 input types which are **file path to image file** (default), **numpy array of image** and **image file stream**. This means you can now perform image prediction in production applications such as on a web server or a system that returns files in any of the above stated formats. To perform image prediction with numpy array or file stream input, you just need to state the input type in the `.classifyImage()` function. See example below. ```python predictions, probabilities = prediction.classifyImage(image_array, result_count=5 , input_type="array" ) # For numpy array input type predictions, probabilities = prediction.classifyImage(image_stream, result_count=5 , input_type="stream" ) # For file stream input type ``` ### Prediction in MultiThreading
When developing programs that run heavy tasks on the default thread, like User Interfaces (UI), you should consider running your predictions in a new thread. When running image prediction using ImageAI in a new thread, you must take note of the following: - You can create your prediction object, set its model type, set model path and json path outside the new thread. - The `.loadModel()` must be in the new thread and image prediction (`classifyImage()`) must take place in the new thread. Take a look at the sample code below on image prediction using multithreading: ```python from imageai.Classification import ImageClassification import os import threading execution_path = os.getcwd() prediction = ImageClassification() prediction.setModelTypeAsResNet50() prediction.setModelPath( os.path.join(execution_path, "resnet50_imagenet_tf.2.0.h5")) picturesfolder = os.environ["USERPROFILE"] + "\\Pictures\\" allfiles = os.listdir(picturesfolder) class PredictionThread(threading.Thread): def __init__(self): threading.Thread.__init__(self) def run(self): prediction.loadModel() for eachPicture in allfiles: if eachPicture.endswith(".png") or eachPicture.endswith(".jpg"): predictions, percentage_probabilities = prediction.classifyImage(picturesfolder + eachPicture, result_count=1) for eachPrediction, percentage_probability in zip(predictions, percentage_probabilities): print(eachPrediction , " : " , percentage_probability) predictionThread = PredictionThread () predictionThread.start() ``` ### Documentation We have provided full documentation for all **ImageAI** classes and functions in 3 major languages. 
import tensorflow as tf
from PIL import Image
import numpy as np
from matplotlib.cbook import deprecated


class ImageClassification:
    """
    This is the image classification class in the ImageAI library. It provides support for 4 different models which are:
    ResNet, MobileNetV2, DenseNet and Inception V3. After instantiating this class, you can set its properties and
    make image classification using its pre-defined functions.

    The following functions are required to be called before a classification can be made
    * setModelPath()
    * At least one of the following, and it must correspond to the model set in the setModelPath()
      [setModelTypeAsMobileNetV2(), setModelTypeAsResNet50(), setModelTypeAsDenseNet121(), setModelTypeAsInceptionV3()]
    * loadModel() [This must be called once only before making a classification]

    Once the above functions have been called, you can call the classifyImage() function of the classification
    instance object at anytime to classify an image.
    """

    def __init__(self):
        self.__modelType = ""            # "", "mobilenetv2", "resnet50", "densenet121" or "inceptionv3"
        self.modelPath = ""              # path to the weights file, set through setModelPath()
        self.__modelLoaded = False       # flipped to True by the first successful loadModel()
        self.__model_collection = []     # loaded Keras models; classifyImage() uses the first one
        self.__input_image_size = 224    # square input size, adjusted by loadModel()

    def setModelPath(self, model_path):
        """
        'setModelPath()' function is required and is used to set the file path to the model adopted
        from the list of the available 4 model types. The model path must correspond to the model
        type set for the classification instance object.

        :param model_path: path to the model weights file
        :return:
        """
        self.modelPath = model_path

    def setModelTypeAsSqueezeNet(self):
        """
        SqueezeNet is no longer supported; calling this always raises.

        :raises ValueError: always
        """
        raise ValueError("ImageAI no longer support SqueezeNet. You can use MobileNetV2 instead by downloading the MobileNetV2 model and call the function 'setModelTypeAsMobileNetV2'")

    def setModelTypeAsMobileNetV2(self):
        """
        'setModelTypeAsMobileNetV2()' is used to set the model type to the MobileNetV2 model
        for the classification instance object.

        :return:
        """
        self.__modelType = "mobilenetv2"

    @deprecated(since="2.1.6", message="'.setModelTypeAsResNet()' has been deprecated! Please use 'setModelTypeAsResNet50()' instead.")
    def setModelTypeAsResNet(self):
        """Deprecated alias of 'setModelTypeAsResNet50()'."""
        return self.setModelTypeAsResNet50()

    def setModelTypeAsResNet50(self):
        """
        'setModelTypeAsResNet50()' is used to set the model type to the ResNet50 model
        for the classification instance object.

        :return:
        """
        self.__modelType = "resnet50"

    @deprecated(since="2.1.6", message="'.setModelTypeAsDenseNet()' has been deprecated! Please use 'setModelTypeAsDenseNet121()' instead.")
    def setModelTypeAsDenseNet(self):
        """Deprecated alias of 'setModelTypeAsDenseNet121()'."""
        return self.setModelTypeAsDenseNet121()

    def setModelTypeAsDenseNet121(self):
        """
        'setModelTypeAsDenseNet121()' is used to set the model type to the DenseNet121 model
        for the classification instance object.

        :return:
        """
        self.__modelType = "densenet121"

    def setModelTypeAsInceptionV3(self):
        """
        'setModelTypeAsInceptionV3()' is used to set the model type to the InceptionV3 model
        for the classification instance object.

        :return:
        """
        self.__modelType = "inceptionv3"

    def loadModel(self, classification_speed="normal"):
        """
        'loadModel()' builds the 1000-class network selected through one of the
        'setModelTypeAs...()' functions and loads its weights from the path defined in the
        setModelPath() function. The network is built only on the first call; later calls
        only update the input size.

        'classification_speed' selects the square input size fed to the network:
        "normal" = 224, "fast" = 160, "faster" = 120, "fastest" = 100. Any other value leaves
        the currently configured input size untouched.

        :param classification_speed: one of "normal", "fast", "faster", "fastest"
        :raises ValueError: if no model type has been set, or if the network cannot be built /
            its weights cannot be loaded from 'self.modelPath'
        :return:
        """
        input_size = {"normal": 224, "fast": 160, "faster": 120, "fastest": 100}.get(classification_speed)
        if input_size is not None:
            self.__input_image_size = input_size

        if self.__modelLoaded:
            return

        if self.__modelType == "":
            raise ValueError("You must set a valid model type before loading the model.")

        keras_apps = tf.keras.applications
        # model type -> (network constructor, message raised when building/loading fails)
        builders = {
            "mobilenetv2": (
                keras_apps.MobileNetV2,
                "An error occured. Ensure your model file is a MobileNetV2 Model and is located in the path {}",
            ),
            "resnet50": (
                keras_apps.ResNet50,
                "An error occured. Ensure your model file is a ResNet50 Model and is located in the path {}",
            ),
            "densenet121": (
                keras_apps.DenseNet121,
                "An error occured. Ensure your model file is a DenseNet121 Model and is located in the path {}",
            ),
            "inceptionv3": (
                keras_apps.InceptionV3,
                "An error occured. Ensure your model file is in {}",
            ),
        }
        if self.__modelType not in builders:
            # Any other model type was a silent no-op before; keep it that way.
            return

        build_model, failure_message = builders[self.__modelType]
        try:
            model = build_model(
                input_shape=(self.__input_image_size, self.__input_image_size, 3),
                weights=None,
                classes=1000,
            )
            model.load_weights(self.modelPath)
        except Exception as err:
            # MobileNetV2 used to be loaded outside its try block, so its failures escaped
            # unwrapped; every model type now reports a ValueError with the cause chained.
            raise ValueError(failure_message.format(self.modelPath)) from err

        self.__model_collection.append(model)
        self.__modelLoaded = True

    def classifyImage(self, image_input, result_count=5, input_type="file"):
        """
        'classifyImage()' classifies a single image and returns two parallel lists.

        * input_type (optional) , the type of input passed in. Acceptable values are "file",
          "array" and "stream"
        * image_input , file path / numpy array / image file stream of the image.
        * result_count (optional) , the number of classifications to be sent which must be whole
          numbers between 1 and 1000. The default is 5.

        'classification_results' holds the class names ordered by descending probability and
        'classification_probabilities' holds the matching percentage probability at the same
        position.

        :param image_input: file path, numpy array or file stream, depending on 'input_type'
        :param result_count: number of top classes to return
        :param input_type: "file", "array" or "stream"
        :raises ValueError: if no model is loaded, 'input_type' is not supported, the image
            cannot be read, or the prediction fails
        :return classification_results, classification_probabilities:
        """
        classification_results = []
        classification_probabilities = []

        if not self.__modelLoaded:
            raise ValueError("You must call the loadModel() function before making classification.")

        target_size = (self.__input_image_size, self.__input_image_size)

        if input_type == "file":
            try:
                loaded_image = tf.keras.preprocessing.image.load_img(image_input, target_size=target_size)
                image_to_predict = tf.keras.preprocessing.image.img_to_array(loaded_image, data_format="channels_last")
                image_to_predict = np.expand_dims(image_to_predict, axis=0)
            except Exception as err:
                raise ValueError("You have set a path to an invalid image file.") from err
        elif input_type == "array":
            try:
                # convert("RGB") guarantees 3 channels (grayscale / RGBA inputs), matching
                # what load_img produces for the "file" input type.
                pil_image = Image.fromarray(np.uint8(image_input)).convert("RGB").resize(target_size)
                image_to_predict = np.expand_dims(np.asarray(pil_image, dtype=np.float64), axis=0)
            except Exception as err:
                raise ValueError("You have parsed in a wrong numpy array for the image") from err
        elif input_type == "stream":
            try:
                pil_image = Image.open(image_input).convert("RGB").resize(target_size)
                image_to_predict = np.expand_dims(np.asarray(pil_image, dtype=np.float64), axis=0)
            except Exception as err:
                raise ValueError("You have parsed in a wrong stream for the image") from err
        else:
            # Previously this fell through with 'image_to_predict' unbound and failed later
            # with an unrelated error.
            raise ValueError('Unsupported input_type "{}". Use "file", "array" or "stream".'.format(input_type))

        keras_apps = tf.keras.applications
        # No entry for "resnet50": its input is passed to the network unchanged.
        preprocessors = {
            "mobilenetv2": keras_apps.mobilenet_v2.preprocess_input,
            "densenet121": keras_apps.densenet.preprocess_input,
            "inceptionv3": keras_apps.inception_v3.preprocess_input,
        }
        preprocess = preprocessors.get(self.__modelType)
        if preprocess is not None:
            image_to_predict = preprocess(image_to_predict)

        try:
            model = self.__model_collection[0]
            prediction = model.predict(image_to_predict, steps=1)

            decoders = {
                "mobilenetv2": keras_apps.mobilenet_v2.decode_predictions,
                "resnet50": keras_apps.resnet50.decode_predictions,
                "inceptionv3": keras_apps.inception_v3.decode_predictions,
                "densenet121": keras_apps.densenet.decode_predictions,
            }
            decoded = decoders[self.__modelType](prediction, top=int(result_count))

            for image_results in decoded:
                # The loop reads index 1 as the class name and index 2 as the score.
                for entry in image_results:
                    classification_results.append(str(entry[1]))
                    classification_probabilities.append(entry[2] * 100)
        except Exception as err:
            raise ValueError("An error occured! Try again.") from err

        return classification_results, classification_probabilities

    @deprecated(since="2.1.6", message="'.predictImage()' has been deprecated! Please use 'classifyImage()' instead.")
    def predictImage(self, image_input, result_count=5, input_type="file"):
        """Deprecated alias of 'classifyImage()'; forwards all arguments unchanged."""
        return self.classifyImage(image_input, result_count, input_type)
To test the custom object detection, you can download a sample custom model we have trained to detect the Hololens headset and its **detection_config.json** file via the links below: * [**hololens-ex-60--loss-2.76.h5**](https://github.com/OlafenwaMoses/ImageAI/releases/download/essential-v4/hololens-ex-60--loss-2.76.h5) _(Size = 236 mb)_ * [**detection_config.json**](https://github.com/OlafenwaMoses/ImageAI/releases/download/essential-v4/detection_config.json) Once you download the custom object detection model file, you should copy the model file to the your project folder where your **.py** files will be. Then create a python file and give it a name; an example is FirstCustomDetection.py. Then write the code below into the python file: ### FirstCustomDetection.py
```python from imageai.Detection.Custom import CustomObjectDetection detector = CustomObjectDetection() detector.setModelTypeAsYOLOv3() detector.setModelPath("hololens-ex-60--loss-2.76.h5") detector.setJsonPath("detection_config.json") detector.loadModel() detections = detector.detectObjectsFromImage(input_image="holo2.jpg", output_image_path="holo2-detected.jpg") for detection in detections: print(detection["name"], " : ", detection["percentage_probability"], " : ", detection["box_points"]) ``` Sample Result - Input: ![Input](../../../data-images/holo2.jpg) Output: ![Output](../../../data-images/holo2-detected.jpg) ``` hololens : 39.69653248786926 : [611, 74, 751, 154] hololens : 87.6643180847168 : [23, 46, 90, 79] hololens : 89.25175070762634 : [191, 66, 243, 95] hololens : 64.49641585350037 : [437, 81, 514, 133] hololens : 91.78624749183655 : [380, 113, 423, 138] ``` Let us make a breakdown of the object detection code that we used above. ```python from imageai.Detection.Custom import CustomObjectDetection detector = CustomObjectDetection() detector.setModelTypeAsYOLOv3() ``` In the 3 lines above , we import the **ImageAI custom object detection** class in the first line, created the class instance on the second line and set the model type to YOLOv3. ```python detector.setModelPath("hololens-ex-60--loss-2.76.h5") detector.setJsonPath("detection_config.json") detector.loadModel() ``` In the 3 lines above, we specified the file path to our downloaded model file in the first line , specified the path to our **detection_config.json** file in the second line and loaded the model on the third line. 
```python detections = detector.detectObjectsFromImage(input_image="holo2.jpg", output_image_path="holo2-detected.jpg") for detection in detections: print(detection["name"], " : ", detection["percentage_probability"], " : ", detection["box_points"]) ``` In the 3 lines above, we ran the `detectObjectsFromImage()` function and parse in the path to our test image, and the path to the new image which the function will save. Then the function returns an array of dictionaries with each dictionary corresponding to the number of objects detected in the image. Each dictionary has the properties `name` (name of the object), `percentage_probability` (percentage probability of the detection) and `box_points` (the x1,y1,x2 and y2 coordinates of the bounding box of the object). ### Object Detection, Extraction and Fine-tune
In the examples we used above, we ran the object detection on an image and it returned the detected objects in an array as well as save a new image with rectangular markers drawn on each object. In our next examples, we will be able to extract each object from the input image and save it independently. In the example code below which is very identical to the previous object detection code, we will save each object detected as a separate image. ```python from imageai.Detection.Custom import CustomObjectDetection detector = CustomObjectDetection() detector.setModelTypeAsYOLOv3() detector.setModelPath("hololens-ex-60--loss-2.76.h5") detector.setJsonPath("detection_config.json") detector.loadModel() detections, extracted_objects_array = detector.detectObjectsFromImage(input_image="holo2.jpg", output_image_path="holo2-detected.jpg", extract_detected_objects=True) for detection, object_path in zip(detections, extracted_objects_array): print(object_path) print(detection["name"], " : ", detection["percentage_probability"], " : ", detection["box_points"]) print("---------------") ``` Sample Result: Output Images ![](../../../data-images/holo2-detected-objects/hololens-1.jpg) ![](../../../data-images/holo2-detected-objects/hololens-2.jpg) ![](../../../data-images/holo2-detected-objects/hololens-3.jpg) ![](../../../data-images/holo2-detected-objects/hololens-4.jpg) ![](../../../data-images/holo2-detected-objects/hololens-5.jpg) ![](../../../data-images/holo2-detected-objects/hololens-6.jpg) ![](../../../data-images/holo2-detected-objects/hololens-7.jpg) Let us review the part of the code that perform the object detection and extract the images: ```python detections, extracted_objects_array = detector.detectObjectsFromImage(input_image="holo2.jpg", output_image_path="holo2-detected.jpg", extract_detected_objects=True) for detection, object_path in zip(detections, extracted_objects_array): print(object_path) print(detection["name"], " : ", detection["percentage_probability"], " 
: ", detection["box_points"]) print("---------------") ``` In the lines above, we called the `detectObjectsFromImage()` function, passing in the input image path, the output image path, and an extra parameter `extract_detected_objects=True`. This parameter states that the function should extract each object detected from the image and save it as a separate image. The parameter is `False` by default. Once set to `True`, the function will create a directory which is the `output image path + "-objects"`. Then it saves all the extracted images into this new directory with each image's name being the `detected object name + "-" + a number` which corresponds to the order in which the objects were detected. This new parameter we set to extract and save detected objects as an image will make the function return 2 values. The first is the array of dictionaries with each dictionary corresponding to a detected object. The second is an array of the paths to the saved images of each object detected and extracted, and they are arranged in the same order as the objects in the first array. ### And one important feature you need to know! You will recall that the percentage probability for each detected object is sent back by the `detectObjectsFromImage()` function. The function has a parameter `minimum_percentage_probability`, whose default value is `30` (value ranges between 0 - 100); that default is the value in effect in this example. That means the function will only return a detected object if its percentage probability is **30 or above**. The value was kept at this number to ensure the integrity of the detection results. You can fine-tune the object detection by setting `minimum_percentage_probability` to a smaller value to detect more objects or to a higher value to detect fewer objects. ### Hiding/Showing Object Name and Probability
**ImageAI** provides options to hide the name of objects detected and/or the percentage probability from being shown on the saved/returned detected image. Using the `detectObjectsFromImage()` and `detectCustomObjectsFromImage()` functions, the parameters `'display_object_name'` and `'display_percentage_probability'` can be set to `True` or `False` individually. Take a look at the code below: ```python detections = detector.detectObjectsFromImage(input_image=os.path.join(execution_path , "holo2.jpg"), output_image_path=os.path.join(execution_path , "holo2_nodetails.jpg"), minimum_percentage_probability=30, display_percentage_probability=False, display_object_name=False) ``` In the above code, we specified that both the object name and percentage probability should not be shown. As you can see in the result below, neither the names of the objects nor their individual percentage probabilities are shown in the detected image. **Result** ![](../../../data-images/holo2-nodetails.jpg) ### Image Input & Output Types
**ImageAI** custom object detection supports 2 types of input, which are **file path to image file** (default) and **numpy array of an image**, as well as 2 types of output, which are image **file** (default) and numpy **array**. This means you can now perform object detection in production applications such as web servers and systems that return files in any of the above stated formats. To perform object detection with numpy array input, you just need to state the input type in the `.detectObjectsFromImage()` function. See example below. ```python detections = detector.detectObjectsFromImage(input_type="array", input_image=image_array , output_image_path=os.path.join(execution_path , "holo2-detected.jpg")) # For numpy array input type ``` To perform object detection with numpy array output, you just need to state the output type in the `.detectObjectsFromImage()` function. See example below. ```python detected_image_array, detections = detector.detectObjectsFromImage(output_type="array", input_image="holo2.jpg" ) # For numpy array output type ``` ### Documentation
We have provided full documentation for all **ImageAI** classes and functions in 3 major languages. Find links below: * Documentation - **English Version** [https://imageai.readthedocs.io](https://imageai.readthedocs.io) * Documentation - **Chinese Version** [https://imageai-cn.readthedocs.io](https://imageai-cn.readthedocs.io) * Documentation - **French Version** [https://imageai-fr.readthedocs.io](https://imageai-fr.readthedocs.io) ================================================ FILE: imageai_tf_deprecated/Detection/Custom/CUSTOMDETECTIONTRAINING.md ================================================ # ImageAI : Custom Detection Model Training --- **ImageAI** provides the most simple and powerful approach to training custom object detection models using the YOLOv3 architecture, which you can load into the `imageai.Detection.Custom.CustomObjectDetection` class. This allows you to train your own model on any set of images that corresponds to any type of objects of interest. The training process generates a JSON file that maps the object names in your image dataset and the detection anchors, as well as creates lots of models. In choosing the best model for your custom object detection task, an `evaluateModel()` function has been provided to compute the **mAP** of your saved models by allowing you to state your desired **IoU** and **Non-maximum Suppression** values. Then you can perform custom object detection using the model and the JSON file generated. ### TABLE OF CONTENTS - :white_square_button: Preparing your custom dataset - :white_square_button: Training on your custom Dataset - :white_square_button: Evaluating your saved detection models' mAP ### Preparing your custom dataset
To train a custom detection model, you need to prepare the images you want to use to train the model. You will prepare the images as follows: 1. Decide the type of object(s) you want to detect and collect about **200 (minimum recommendation)** or more pictures of each of the object(s) 2. Once you have collected the images, you need to annotate the object(s) in the images. **ImageAI** uses the **Pascal VOC format** for image annotation. You can generate this annotation for your images using the easy to use [**LabelImg**](https://github.com/tzutalin/labelImg) image annotation tool, available for Windows, Linux and MacOS systems. Open the link below to install the annotation tool. See: [https://github.com/tzutalin/labelImg](https://github.com/tzutalin/labelImg) 3. When you are done annotating your images, an **annotation XML** file will be generated for each image in your dataset. The **annotation XML** file describes each or **all** of the objects in the image. For example, if your image names are **image(1).jpg**, **image(2).jpg**, **image(3).jpg** till **image(z).jpg**; the corresponding annotation for each of the images will be **image(1).xml**, **image(2).xml**, **image(3).xml** till **image(z).xml**. 4. Once you have the annotations for all your images, create a folder for your dataset (e.g. headsets) and in this parent folder, create child folders **train** and **validation** 5. In the train folder, create **images** and **annotations** sub-folders. Put about 70-80% of your dataset of each object's images in the **images** folder and put the corresponding annotations for these images in the **annotations** folder. 6. In the validation folder, create **images** and **annotations** sub-folders. Put the rest of your dataset images in the **images** folder and put the corresponding annotations for these images in the **annotations** folder. 7.
Once you have done this, the structure of your image dataset folder should look like below: ``` >> train >> images >> img_1.jpg (shows Object_1) >> images >> img_2.jpg (shows Object_2) >> images >> img_3.jpg (shows Object_1, Object_3 and Object_n) >> annotations >> img_1.xml (describes Object_1) >> annotations >> img_2.xml (describes Object_2) >> annotations >> img_3.xml (describes Object_1, Object_3 and Object_n) >> validation >> images >> img_151.jpg (shows Object_1, Object_3 and Object_n) >> images >> img_152.jpg (shows Object_2) >> images >> img_153.jpg (shows Object_1) >> annotations >> img_151.xml (describes Object_1, Object_3 and Object_n) >> annotations >> img_152.xml (describes Object_2) >> annotations >> img_153.xml (describes Object_1) ``` 8. You can train your custom detection model completely from scratch or use transfer learning (recommended for better accuracy) from a pre-trained YOLOv3 model. Also, we have provided a sample annotated Hololens and Headsets (Hololens and Oculus) dataset for you to train with. Download the pre-trained YOLOv3 model and the sample datasets in the link below. [https://github.com/OlafenwaMoses/ImageAI/releases/tag/essential-v4](https://github.com/OlafenwaMoses/ImageAI/releases/tag/essential-v4) ### Training on your custom dataset
Before you start training your custom detection model, kindly take note of the following: - The default **batch_size** is 4. If you are training with **Google Colab**, this will be fine. However, I would advise you to use a more powerful GPU than the K80 offered by Colab as the higher your **batch_size (8, 16)**, the better the accuracy of your detection model. - If you experience the `'_TfDeviceCaptureOp' object has no attribute '_set_device_from_string'` error in Google Colab, it is due to a bug in **Tensorflow**. You can solve this by installing **Tensorflow GPU 1.13.1**. ```bash pip3 install tensorflow-gpu==1.13.1 ``` Then your training code goes as follows: ```python from imageai.Detection.Custom import DetectionModelTrainer trainer = DetectionModelTrainer() trainer.setModelTypeAsYOLOv3() trainer.setDataDirectory(data_directory="hololens") trainer.setTrainConfig(object_names_array=["hololens"], batch_size=4, num_experiments=200, train_from_pretrained_model="pretrained-yolov3.h5") # In the above,when training for detecting multiple objects, #set object_names_array=["object1", "object2", "object3",..."objectz"] trainer.trainModel() ``` Yes! Just 6 lines of code and you can train object detection models on your custom dataset. Now let's take a look at how the code above works. ```python from imageai.Detection.Custom import DetectionModelTrainer trainer = DetectionModelTrainer() trainer.setModelTypeAsYOLOv3() trainer.setDataDirectory(data_directory="hololens") ``` In the first line, we import the **ImageAI** detection model training class, then we define the model trainer in the second line, we set the network type in the third line and set the path to the image dataset we want to train the network on in the fourth line. ```python trainer.setTrainConfig(object_names_array=["hololens"], batch_size=4, num_experiments=200, train_from_pretrained_model="pretrained-yolov3.h5") ``` In the line above, we configured our detection model trainer.
The parameters we stated in the function are as below: - **object_names_array** : this is an array containing the names of the objects in our dataset - **batch_size** : this is to state the batch size for the training - **num_experiments** : this is to state the number of times the network will train over all the training images, which is also called epochs - **train_from_pretrained_model(optional)** : this is to train using transfer learning from a pre-trained **YOLOv3** model ```python trainer.trainModel() ``` When you start the training, you should see something like this in the console: ``` Using TensorFlow backend. Generating anchor boxes for training images and annotation... Average IOU for 9 anchors: 0.78 Anchor Boxes generated. Detection configuration saved in hololens/json/detection_config.json Training on: ['hololens'] Training with Batch Size: 4 Number of Experiments: 200 Epoch 1/200 480/480 [==============================] - 395s 823ms/step - loss: 36.9000 - yolo_layer_1_loss: 3.2970 - yolo_layer_2_loss: 9.4923 - yolo_layer_3_loss: 24.1107 - val_loss: 15.6321 - val_yolo_layer_1_loss: 2.0275 - val_yolo_layer_2_loss: 6.4191 - val_yolo_layer_3_loss: 7.1856 Epoch 2/200 480/480 [==============================] - 293s 610ms/step - loss: 11.9330 - yolo_layer_1_loss: 1.3968 - yolo_layer_2_loss: 4.2894 - yolo_layer_3_loss: 6.2468 - val_loss: 7.9868 - val_yolo_layer_1_loss: 1.7054 - val_yolo_layer_2_loss: 2.9156 - val_yolo_layer_3_loss: 3.3657 Epoch 3/200 480/480 [==============================] - 293s 610ms/step - loss: 7.1228 - yolo_layer_1_loss: 1.0583 - yolo_layer_2_loss: 2.2863 - yolo_layer_3_loss: 3.7782 - val_loss: 6.4964 - val_yolo_layer_1_loss: 1.1391 - val_yolo_layer_2_loss: 2.2058 - val_yolo_layer_3_loss: 3.1514 Epoch 4/200 480/480 [==============================] - 297s 618ms/step - loss: 5.5802 - yolo_layer_1_loss: 0.9742 - yolo_layer_2_loss: 1.8916 - yolo_layer_3_loss: 2.7144 - val_loss: 6.4275 - val_yolo_layer_1_loss: 1.6153 - val_yolo_layer_2_loss: 2.1203
- val_yolo_layer_3_loss: 2.6919 Epoch 5/200 480/480 [==============================] - 295s 615ms/step - loss: 4.8717 - yolo_layer_1_loss: 0.7568 - yolo_layer_2_loss: 1.6641 - yolo_layer_3_loss: 2.4508 - val_loss: 6.3723 - val_yolo_layer_1_loss: 1.6434 - val_yolo_layer_2_loss: 2.1188 - val_yolo_layer_3_loss: 2.6101 Epoch 6/200 480/480 [==============================] - 300s 624ms/step - loss: 4.7989 - yolo_layer_1_loss: 0.8708 - yolo_layer_2_loss: 1.6683 - yolo_layer_3_loss: 2.2598 - val_loss: 5.8672 - val_yolo_layer_1_loss: 1.2349 - val_yolo_layer_2_loss: 2.0504 - val_yolo_layer_3_loss: 2.5820 Epoch 7/200 ``` Let us explain the details shown above: ``` Using TensorFlow backend. Generating anchor boxes for training images and annotation... Average IOU for 9 anchors: 0.78 Anchor Boxes generated. Detection configuration saved in hololens/json/detection_config.json Training on: ['hololens'] Training with Batch Size: 4 Number of Experiments: 200 ``` The above details signifies the following: - **ImageAI** autogenerates the best match detection **anchor boxes** for your image dataset. - The anchor boxes and the object names mapping are saved in **json/detection_config.json** path of in the image dataset folder. Please note that for every new training you start, a new **detection_config.json** file is generated and is only compatible with the model saved during that training. 
``` Epoch 1/200 480/480 [==============================] - 395s 823ms/step - loss: 36.9000 - yolo_layer_1_loss: 3.2970 - yolo_layer_2_loss: 9.4923 - yolo_layer_3_loss: 24.1107 - val_loss: 15.6321 - val_yolo_layer_1_loss: 2.0275 - val_yolo_layer_2_loss: 6.4191 - val_yolo_layer_3_loss: 7.1856 Epoch 2/200 480/480 [==============================] - 293s 610ms/step - loss: 11.9330 - yolo_layer_1_loss: 1.3968 - yolo_layer_2_loss: 4.2894 - yolo_layer_3_loss: 6.2468 - val_loss: 7.9868 - val_yolo_layer_1_loss: 1.7054 - val_yolo_layer_2_loss: 2.9156 - val_yolo_layer_3_loss: 3.3657 Epoch 3/200 480/480 [==============================] - 293s 610ms/step - loss: 7.1228 - yolo_layer_1_loss: 1.0583 - yolo_layer_2_loss: 2.2863 - yolo_layer_3_loss: 3.7782 - val_loss: 6.4964 - val_yolo_layer_1_loss: 1.1391 - val_yolo_layer_2_loss: 2.2058 - val_yolo_layer_3_loss: 3.1514 Epoch 4/200 480/480 [==============================] - 297s 618ms/step - loss: 5.5802 - yolo_layer_1_loss: 0.9742 - yolo_layer_2_loss: 1.8916 - yolo_layer_3_loss: 2.7144 - val_loss: 6.4275 - val_yolo_layer_1_loss: 1.6153 - val_yolo_layer_2_loss: 2.1203 - val_yolo_layer_3_loss: 2.6919 Epoch 5/200 480/480 [==============================] - 295s 615ms/step - loss: 4.8717 - yolo_layer_1_loss: 0.7568 - yolo_layer_2_loss: 1.6641 - yolo_layer_3_loss: 2.4508 - val_loss: 6.3723 - val_yolo_layer_1_loss: 1.6434 - val_yolo_layer_2_loss: 2.1188 - val_yolo_layer_3_loss: 2.6101 Epoch 6/200 480/480 [==============================] - 300s 624ms/step - loss: 4.7989 - yolo_layer_1_loss: 0.8708 - yolo_layer_2_loss: 1.6683 - yolo_layer_3_loss: 2.2598 - val_loss: 5.8672 - val_yolo_layer_1_loss: 1.2349 - val_yolo_layer_2_loss: 2.0504 - val_yolo_layer_3_loss: 2.5820 Epoch 7/200 ``` - The above signifies the progress of the training. - For each experiment (Epoch), the general total validation loss (E.g - loss: 4.7582) is reported. - For each drop in the loss after an experiment, a model is saved in the **hololens/models** folder. 
The lower the loss, the better the model. - **Tensorboard** report file for the training will be saved in the **hololens/logs** folder. Once you are done training, you can visit the link below for performing object detection with your **custom detection model** and **detection_config.json** file. [Detection/Custom/CUSTOMDETECTION.md](./CUSTOMDETECTION.md) ### Evaluating your saved detection models' mAP
After training on your custom dataset, you can evaluate the mAP of your saved models by specifying your desired IoU and Non-maximum suppression values. See details as below: - **Single Model Evaluation:** To evaluate a single model, simply use the example code below with the path to your dataset directory, the model file and the **detection_config.json** file saved during the training. In the example, we used an **object_threshold** of 0.3 ( percentage_score >= 30% ), **IoU** of 0.5 and **Non-maximum suppression** value of 0.5. ```python from imageai.Detection.Custom import DetectionModelTrainer trainer = DetectionModelTrainer() trainer.setModelTypeAsYOLOv3() trainer.setDataDirectory(data_directory="hololens") metrics = trainer.evaluateModel(model_path="detection_model-ex-60--loss-2.76.h5", json_path="detection_config.json", iou_threshold=0.5, object_threshold=0.3, nms_threshold=0.5) ``` Consider that `trainer.evaluateModel` method will show the metrics on standard output as shown below, but also returns a list of dicts containing all the information that is displayed. 
Sample Result: ``` Model File: hololens_detection_model-ex-09--loss-4.01.h5 Using IoU : 0.5 Using Object Threshold : 0.3 Using Non-Maximum Suppression : 0.5 hololens: 0.9613 mAP: 0.9613 =============================== ``` Let's see how those metrics looks like: ``` [{ 'average_precision': {'hololens': 0.9613334437735249}, 'map': 0.9613334437735249, 'model_file': 'hololens_detection_model-ex-09--loss-4.01.h5', 'using_iou': 0.5, 'using_non_maximum_suppression': 0.5, 'using_object_threshold': 0.3 }] ``` - **Multi Model Evaluation:** To evaluate all your saved models, simply parse in the path to the folder containing the models as the **model_path** as seen in the example below: ```python from imageai.Detection.Custom import DetectionModelTrainer trainer = DetectionModelTrainer() trainer.setModelTypeAsYOLOv3() trainer.setDataDirectory(data_directory="hololens") metrics = trainer.evaluateModel(model_path="hololens/models", json_path="hololens/json/detection_config.json", iou_threshold=0.5, object_threshold=0.3, nms_threshold=0.5) ``` Sample Result: ``` Model File: hololens/models/detection_model-ex-07--loss-4.42.h5 Using IoU : 0.5 Using Object Threshold : 0.3 Using Non-Maximum Suppression : 0.5 hololens: 0.9231 mAP: 0.9231 =============================== Model File: hololens/models/detection_model-ex-10--loss-3.95.h5 Using IoU : 0.5 Using Object Threshold : 0.3 Using Non-Maximum Suppression : 0.5 hololens: 0.9725 mAP: 0.9725 =============================== Model File: hololens/models/detection_model-ex-05--loss-5.26.h5 Using IoU : 0.5 Using Object Threshold : 0.3 Using Non-Maximum Suppression : 0.5 hololens: 0.9204 mAP: 0.9204 =============================== Model File: hololens/models/detection_model-ex-03--loss-6.44.h5 Using IoU : 0.5 Using Object Threshold : 0.3 Using Non-Maximum Suppression : 0.5 hololens: 0.8120 mAP: 0.8120 =============================== Model File: hololens/models/detection_model-ex-18--loss-2.96.h5 Using IoU : 0.5 Using Object Threshold : 0.3 
Using Non-Maximum Suppression : 0.5 hololens: 0.9431 mAP: 0.9431 =============================== Model File: hololens/models/detection_model-ex-17--loss-3.10.h5 Using IoU : 0.5 Using Object Threshold : 0.3 Using Non-Maximum Suppression : 0.5 hololens: 0.9404 mAP: 0.9404 =============================== Model File: hololens/models/detection_model-ex-08--loss-4.16.h5 Using IoU : 0.5 Using Object Threshold : 0.3 Using Non-Maximum Suppression : 0.5 hololens: 0.9725 mAP: 0.9725 =============================== ``` Let's see how those metrics looks like: ``` [{ 'average_precision': {'hololens': 0.9231334437735249}, 'map': 0.9231334437735249, 'model_file': 'hololens/models/detection_model-ex-07--loss-4.42.h5', 'using_iou': 0.5, 'using_non_maximum_suppression': 0.5, 'using_object_threshold': 0.3 }, { 'average_precision': {'hololens': 0.9725334437735249}, 'map': 0.97251334437735249, 'model_file': 'hololens/models/detection_model-ex-10--loss-3.95.h5', 'using_iou': 0.5, 'using_non_maximum_suppression': 0.5, 'using_object_threshold': 0.3 }, { 'average_precision': {'hololens': 0.92041334437735249}, 'map': 0.92041334437735249, 'model_file': 'hololens/models/detection_model-ex-05--loss-5.26.h5', 'using_iou': 0.5, 'using_non_maximum_suppression': 0.5, 'using_object_threshold': 0.3 }, { 'average_precision': {'hololens': 0.81201334437735249}, 'map': 0.81201334437735249, 'model_file': 'hololens/models/detection_model-ex-03--loss-6.44.h5', 'using_iou': 0.5, 'using_non_maximum_suppression': 0.5, 'using_object_threshold': 0.3 }, { 'average_precision': {'hololens': 0.94311334437735249}, 'map': 0.94311334437735249, 'model_file': 'hololens/models/detection_model-ex-18--loss-2.96.h5', 'using_iou': 0.5, 'using_non_maximum_suppression': 0.5, 'using_object_threshold': 0.3 }, { 'average_precision': {'hololens': 0.94041334437735249}, 'map': 0.94041334437735249, 'model_file': 'hololens/models/detection_model-ex-17--loss-3.10.h5', 'using_iou': 0.5, 'using_non_maximum_suppression': 0.5, 
'using_object_threshold': 0.3 }, { 'average_precision': {'hololens': 0.97251334437735249}, 'map': 0.97251334437735249, 'model_file': 'hololens/models/detection_model-ex-08--loss-4.16.h5', 'using_iou': 0.5, 'using_non_maximum_suppression': 0.5, 'using_object_threshold': 0.3 } ] ``` ### >> Documentation
We have provided full documentation for all **ImageAI** classes and functions in 3 major languages. Find links below: * Documentation - **English Version** [https://imageai.readthedocs.io](https://imageai.readthedocs.io) * Documentation - **Chinese Version** [https://imageai-cn.readthedocs.io](https://imageai-cn.readthedocs.io) * Documentation - **French Version** [https://imageai-fr.readthedocs.io](https://imageai-fr.readthedocs.io) ================================================ FILE: imageai_tf_deprecated/Detection/Custom/CUSTOMVIDEODETECTION.md ================================================ # ImageAI : Custom Video Object Detection, Tracking and Analysis A **DeepQuest AI** project [https://deepquestai.com](https://deepquestai.com) --- ### TABLE OF CONTENTS - :white_square_button: First Custom Video Object Detection - :white_square_button: Camera / Live Stream Video Detection - :white_square_button: Video Analysis - :white_square_button: Hiding/Showing Object Name and Probability - :white_square_button: Frame Detection Intervals - :white_square_button: Video Detection Timeout (NEW) - :white_square_button: Documentation ImageAI provides convenient, flexible and powerful methods to perform object detection on videos using your own **custom YOLOv3 model** and the corresponding **detection_config.json** generated during the training. This version of **ImageAI** provides commercial-grade video object detection features, which include but are not limited to device/IP camera inputs, per frame, per second, per minute and entire video analysis for storing in databases and/or real-time visualizations and for future insights.
To test the custom video object detection, you can download a sample custom model we have trained to detect the Hololens headset and its **detection_config.json** file via the links below: - [**hololens-ex-60--loss-2.76.h5**](https://github.com/OlafenwaMoses/ImageAI/releases/download/essential-v4/hololens-ex-60--loss-2.76.h5) _(Size = 236 mb)_ - [**detection_config.json**](https://github.com/OlafenwaMoses/ImageAI/releases/download/essential-v4/detection_config.json) Because video object detection is a compute-intensive task, we advise you to perform this experiment using a computer with an NVIDIA GPU and the GPU version of Tensorflow installed. Performing video object detection on a CPU will be slower than using an NVIDIA GPU powered computer. You can use Google Colab for this experiment as it has an NVIDIA K80 GPU available for free.
Once you download the custom object detection model and JSON files, you should copy the model and the JSON files to your project folder where your .py files will be. Then create a python file and give it a name; an example is FirstCustomVideoObjectDetection.py. Then write the code below into the python file:
### FirstCustomVideoObjectDetection.py
```python from imageai.Detection.Custom import CustomVideoObjectDetection import os execution_path = os.getcwd() video_detector = CustomVideoObjectDetection() video_detector.setModelTypeAsYOLOv3() video_detector.setModelPath("hololens-ex-60--loss-2.76.h5") video_detector.setJsonPath("detection_config.json") video_detector.loadModel() video_detector.detectObjectsFromVideo(input_file_path="holo1.mp4", output_file_path=os.path.join(execution_path, "holo1-detected3"), frames_per_second=20, minimum_percentage_probability=40, log_progress=True) ``` [**Input Video**](../../../data-videos/holo1.mp4) [![Input Video](../../../data-images/holo-video.jpg)](../../../data-videos/holo1.mp4) [**Output Video**](https://www.youtube.com/watch?v=4o5GyAR4Mpw) [![Output Video](../../../data-images/holo-video-detected.jpg)](https://www.youtube.com/watch?v=4o5GyAR4Mpw) Let us make a breakdown of the object detection code that we used above. ```python from imageai.Detection.Custom import CustomVideoObjectDetection import os execution_path = os.getcwd() ``` In the 3 lines above, we import the **ImageAI custom video object detection** class in the first line, import the **os** module in the second line and obtain the path to the folder where our python file runs in the third line. ```python video_detector = CustomVideoObjectDetection() video_detector.setModelTypeAsYOLOv3() video_detector.setModelPath("hololens-ex-60--loss-2.76.h5") video_detector.setJsonPath("detection_config.json") video_detector.loadModel() ``` In the 5 lines above, we created a new instance of the `CustomVideoObjectDetection` class in the first line, set the model type to YOLOv3 in the second line, set the model path to our custom YOLOv3 model file in the third line, specified the path to the model's corresponding **detection_config.json** in the fourth line and loaded the model in the fifth line.
```python video_detector.detectObjectsFromVideo(input_file_path="holo1.mp4", output_file_path=os.path.join(execution_path, "holo1-detected3"), frames_per_second=20, minimum_percentage_probability=40, log_progress=True) ``` In the code above, we ran the `detectObjectsFromVideo()` function and passed in the path to our video, the path to the new video (without the extension, it saves a .avi video by default) which the function will save, the number of frames per second (fps) that we desire the output video to have, the minimum percentage probability and the option to log the progress of the detection in the console. Then the function returns the path to the saved video which contains boxes and percentage probabilities rendered on objects detected in the video. ### Camera / Live Stream Video Detection
**ImageAI** now allows live-video detection with support for camera inputs. Using **OpenCV**'s **VideoCapture()** function, you can load live-video streams from a device camera, cameras connected by cable or IP cameras, and pass them into **ImageAI**'s **detectObjectsFromVideo()** function. All features that are supported for detecting objects in a video file are also available for detecting objects in a camera's live-video feed. Find below an example of detecting objects in a live-video feed from the device camera. ```python from imageai.Detection.Custom import CustomVideoObjectDetection import os import cv2 execution_path = os.getcwd() camera = cv2.VideoCapture(0) video_detector = CustomVideoObjectDetection() video_detector.setModelTypeAsYOLOv3() video_detector.setModelPath("hololens-ex-60--loss-2.76.h5") video_detector.setJsonPath("detection_config.json") video_detector.loadModel() video_detector.detectObjectsFromVideo(camera_input=camera, output_file_path=os.path.join(execution_path, "holo1-detected3"), frames_per_second=20, minimum_percentage_probability=40, log_progress=True) ``` The difference between the code above and the code for the detection of a video file is that we defined an **OpenCV VideoCapture** instance and loaded the default device camera into it. Then we passed the camera we defined into the parameter **camera_input**, which replaces the **input_file_path** that is used for a video file. ### Video Analysis
**ImageAI** now provides commercial-grade video analysis in the Custom Video Object Detection class, for both video file inputs and camera inputs. This feature allows developers to obtain deep insights into any video processed with **ImageAI**. These insights can be visualized in real-time or stored in a NoSQL database for future review or analysis.
For video analysis, the **detectObjectsFromVideo()** function now allows you to state your own defined functions which will be executed for every frame, second and/or minute of the video detected, as well as state a function that will be executed at the end of a video detection. Once these functions are stated, they will receive raw but comprehensive analytical data on the index of the frame/second/minute, objects detected (name, percentage_probability and box_points), number of instances of each unique object detected and average number of occurrences of each unique object detected over a second/minute and entire video. To obtain the video analysis, all you need to do is specify a function, state the corresponding parameters it will be receiving and pass the function name into the **per_frame_function**, **per_second_function**, **per_minute_function** and **video_complete_function** parameters in the detection function. Find below examples of video analysis functions. ```python def forFrame(frame_number, output_array, output_count): print("FOR FRAME " , frame_number) print("Output for each object : ", output_array) print("Output count for unique objects : ", output_count) print("------------END OF A FRAME --------------") def forSeconds(second_number, output_arrays, count_arrays, average_output_count): print("SECOND : ", second_number) print("Array for the outputs of each frame ", output_arrays) print("Array for output count for unique objects in each frame : ", count_arrays) print("Output average count for unique objects in the last second: ", average_output_count) print("------------END OF A SECOND --------------") def forMinute(minute_number, output_arrays, count_arrays, average_output_count): print("MINUTE : ", minute_number) print("Array for the outputs of each frame ", output_arrays) print("Array for output count for unique objects in each frame : ", count_arrays) print("Output average count for unique objects in the last minute: ", average_output_count)
print("------------END OF A MINUTE --------------") video_detector = CustomVideoObjectDetection() video_detector.setModelTypeAsYOLOv3() video_detector.setModelPath("hololens-ex-60--loss-2.76.h5") video_detector.setJsonPath("detection_config.json") video_detector.loadModel() video_detector.detectObjectsFromVideo(camera_input=camera, output_file_path=os.path.join(execution_path, "holo1-detected3"), frames_per_second=20, per_second_function=forSeconds, per_frame_function = forFrame, per_minute_function= forMinute, minimum_percentage_probability=40, log_progress=True) ``` **ImageAI** also allows you to obtain complete analysis of the entire video processed. All you need is to define a function like the forSeconds or forMinute function and pass it as the **video_complete_function** parameter of your **.detectObjectsFromVideo()** function. The same values as for the per_second_function and per_minute_function will be returned. The difference is that no index will be returned; only the other 3 values will be returned, and they will cover all frames in the video. Below is a sample function: ```python def forFull(output_arrays, count_arrays, average_output_count): #Perform action on the 3 parameters returned into the function video_detector.detectObjectsFromVideo(camera_input=camera, output_file_path=os.path.join(execution_path, "holo1-detected3"), video_complete_function=forFull, minimum_percentage_probability=40, log_progress=True) ``` **FINAL NOTE ON VIDEO ANALYSIS** : **ImageAI** allows you to obtain the detected video frame as a Numpy array at each frame, second and minute function. All you need to do is specify one more parameter in your function and set **return_detected_frame=True** in your **detectObjectsFromVideo()** function. Once this is set, the extra parameter you specified in your function will be the Numpy array of the detected frame.
See a sample below: ```python def forFrame(frame_number, output_array, output_count, detected_frame): print("FOR FRAME " , frame_number) print("Output for each object : ", output_array) print("Output count for unique objects : ", output_count) print("Returned Objects is : ", type(detected_frame)) print("------------END OF A FRAME --------------") video_detector.detectObjectsFromVideo(camera_input=camera, output_file_path=os.path.join(execution_path, "holo1-detected3"), per_frame_function=forFrame, minimum_percentage_probability=40, log_progress=True, return_detected_frame=True) ``` ### Frame Detection Intervals
The above video object detection tasks are optimized for frame-real-time object detection, which ensures that objects in every frame of the video are detected. **ImageAI** provides you the option to adjust the video frame detections, which can speed up your video detection process. When calling the `.detectObjectsFromVideo()` function, you can specify at which frame interval detections should be made. Setting the **frame_detection_interval** parameter to 5 or 20 means the object detections in the video will be updated after every 5 frames or every 20 frames. If your output video **frames_per_second** is set to 20, that means the object detections in the video will be updated once in every quarter of a second or once every second, respectively. This is useful in scenarios where the available compute is less powerful and the speeds of moving objects are low. This ensures you can have objects detected as second-real-time, half-a-second-real-time or whichever way suits your needs. ### Custom Video Detection Timeout
**ImageAI** now allows you to set a timeout in seconds for detection of objects in videos or camera live feed. To set a timeout for your video detection code, all you need to do is specify the `detection_timeout` parameter in the `detectObjectsFromVideo()` function to the number of desired seconds. In the example code below, we set `detection_timeout` to 120 seconds (2 minutes). ```python from imageai.Detection.Custom import CustomVideoObjectDetection import os import cv2 execution_path = os.getcwd() camera = cv2.VideoCapture(0) video_detector = CustomVideoObjectDetection() video_detector.setModelTypeAsYOLOv3() video_detector.setModelPath("hololens-ex-60--loss-2.76.h5") video_detector.setJsonPath("detection_config.json") video_detector.loadModel() video_detector.detectObjectsFromVideo(camera_input=camera, output_file_path=os.path.join(execution_path, "holo1-detected3"), frames_per_second=20, minimum_percentage_probability=40, detection_timeout=120) ``` ### >> Documentation
We have provided full documentation for all **ImageAI** classes and functions in 3 major languages. Find links below: * Documentation - **English Version** [https://imageai.readthedocs.io](https://imageai.readthedocs.io)** * Documentation - **Chinese Version** [https://imageai-cn.readthedocs.io](https://imageai-cn.readthedocs.io)** * Documentation - **French Version** [https://imageai-fr.readthedocs.io](https://imageai-fr.readthedocs.io)** ================================================ FILE: imageai_tf_deprecated/Detection/Custom/__init__.py ================================================ import os import re import numpy as np import json from imageai.Detection.Custom.voc import parse_voc_annotation from imageai.Detection.YOLO.yolov3 import yolov3_main, yolov3_train, dummy_loss from imageai.Detection.Custom.generator import BatchGenerator from imageai.Detection.Custom.utils.utils import normalize, evaluate, makedirs from tensorflow.keras.callbacks import ReduceLROnPlateau from tensorflow.keras.optimizers import Adam from imageai.Detection.Custom.callbacks import CustomModelCheckpoint from imageai.Detection.Custom.utils.multi_gpu_model import multi_gpu_model from imageai.Detection.Custom.gen_anchors import generateAnchors import tensorflow as tf from tensorflow.keras.models import load_model from tensorflow.keras import Input from tensorflow.keras.callbacks import TensorBoard import tensorflow.keras.backend as K import cv2 tf.config.run_functions_eagerly(True) os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" class DetectionModelTrainer: """ This is the Detection Model training class, which allows you to train object detection models on image datasets that are in Pascal VOC annotation format, using the YOLOv3. 
""" def __init__(self): self.__model_type = "" self.__training_mode = True self.__model_min_input_size = 288 self.__model_max_input_size = 448 self.__model_anchors = [] self.__inference_anchors = [] self.__json_directory = "" self.__model_labels = [] self.__num_objects = 0 self.__pre_trained_model = "" self.__train_images_folder = "" self.__train_annotations_folder = "" self.__train_cache_file = "" self.__train_times = 8 self.__train_batch_size = 4 self.__train_learning_rate = 1e-4 self.__train_epochs = 100 self.__train_warmup_epochs = 3 self.__train_ignore_treshold = 0.5 self.__train_gpus = "0" self.__train_grid_scales = [1, 1, 1] self.__train_obj_scale = 5 self.__train_noobj_scale = 1 self.__train_xywh_scale = 1 self.__train_class_scale = 1 self.__model_directory = "" self.__train_weights_name = "" self.__train_debug = True self.__logs_directory = "" self.__validation_images_folder = "" self.__validation_annotations_folder = "" self.__validation_cache_file = "" self.__validation_times = 1 def setModelTypeAsYOLOv3(self): """ 'setModelTypeAsYOLOv3()' is used to set the model type to the YOLOv3 model for the training instance object . :return: """ self.__model_type = "yolov3" def setDataDirectory(self, data_directory): """ 'setDataDirectory()' is required to set the path to which the data/dataset to be used for training is kept. The directory can have any name, but it must have 'train' and 'validation' sub-directory. In the 'train' and 'validation' sub-directories, there must be 'images' and 'annotations' sub-directories respectively. The 'images' folder will contain the pictures for the dataset and the 'annotations' folder will contain the XML files with details of the annotations for each image in the 'images folder'. N.B: Strictly take note that the filenames (without the extension) of the pictures in the 'images folder' must be the same as the filenames (without the extension) of their corresponding annotation XML files in the 'annotations' folder. 
The structure of the 'train' and 'validation' folder must be as follows: >> train >> images >> img_1.jpg >> images >> img_2.jpg >> images >> img_3.jpg >> annotations >> img_1.xml >> annotations >> img_2.xml >> annotations >> img_3.xml >> validation >> images >> img_151.jpg >> images >> img_152.jpg >> images >> img_153.jpg >> annotations >> img_151.xml >> annotations >> img_152.xml >> annotations >> img_153.xml :param data_directory: :return: """ self.__train_images_folder = os.path.join(data_directory, "train", "images") self.__train_annotations_folder = os.path.join(data_directory, "train", "annotations") self.__validation_images_folder = os.path.join(data_directory, "validation", "images") self.__validation_annotations_folder = os.path.join(data_directory, "validation", "annotations") os.makedirs(os.path.join(data_directory, "cache"), exist_ok=True) self.__train_cache_file = os.path.join(data_directory, "cache", "detection_train_data.pkl") self.__validation_cache_file = os.path.join(data_directory, "cache", "detection_test_data.pkl") os.makedirs(os.path.join(data_directory, "models"), exist_ok=True) os.makedirs(os.path.join(data_directory, "json"), exist_ok=True) os.makedirs(os.path.join(data_directory, "logs"), exist_ok=True) self.__model_directory = os.path.join(data_directory, "models") self.__train_weights_name = os.path.join(self.__model_directory, "detection_model-") self.__json_directory = os.path.join(data_directory, "json") self.__logs_directory = os.path.join(data_directory, "logs") def setGpuUsage(self, train_gpus): """ 'setGpuUsage' function allows you to set the GPUs to be used while training train_gpu can be: - an integer, indicating the number of GPUs to use - a list of integers, indicating the id of the GPUs to be used - a string, indicating the it og the id of the GPUs to be used, separated by commas :param train_gpus: gpus where to run :return: """ # train_gpus, could be a string separated by comma, or a list of int or the number of GPUs to be 
used if type(train_gpus) == str: train_gpus = train_gpus.split(',') if type(train_gpus) == int: train_gpus = range(train_gpus) # let it as a string separated by commas self.__train_gpus = ','.join([str(gpu) for gpu in train_gpus]) def setTrainConfig(self, object_names_array, batch_size=4, num_experiments=100, train_from_pretrained_model=""): """ 'setTrainConfig()' function allows you to set the properties for the training instances. It accepts the following values: - object_names_array , this is an array of the names of the different objects in your dataset - batch_size (optional), this is the batch size for the training instance - num_experiments (optional), also known as epochs, it is the number of times the network will train on all the training dataset - train_from_pretrained_model (optional), this is used to perform transfer learning by specifying the path to a pre-trained YOLOv3 model :param object_names_array: :param batch_size: :param num_experiments: :param train_from_pretrained_model: :return: """ # Remove cache files if os.path.isfile(self.__train_cache_file) == True: os.remove(self.__train_cache_file) if os.path.isfile(self.__validation_cache_file) == True: os.remove(self.__validation_cache_file) self.__model_anchors, self.__inference_anchors = generateAnchors(self.__train_annotations_folder, self.__train_images_folder, self.__train_cache_file, self.__model_labels) self.__model_labels = sorted(object_names_array) self.__num_objects = len(object_names_array) self.__train_batch_size = batch_size self.__train_epochs = num_experiments self.__pre_trained_model = train_from_pretrained_model json_data = dict() json_data["labels"] = self.__model_labels json_data["anchors"] = self.__inference_anchors with open(os.path.join(self.__json_directory, "detection_config.json"), "w+") as json_file: json.dump(json_data, json_file, indent=4, separators=(",", " : "), ensure_ascii=True) print("Detection configuration saved in ", os.path.join(self.__json_directory, 
"detection_config.json")) def trainModel(self): """ 'trainModel()' function starts the actual model training. Once the training starts, the training instance creates 3 sub-folders in your dataset folder which are: - json, where the JSON configuration file for using your trained model is stored - models, where your trained models are stored once they are generated after each improved experiments - cache , where temporary traing configuraton files are stored :return: """ train_ints, valid_ints, labels, max_box_per_image = self._create_training_instances( self.__train_annotations_folder, self.__train_images_folder, self.__train_cache_file, self.__validation_annotations_folder, self.__validation_images_folder, self.__validation_cache_file, self.__model_labels ) if self.__training_mode: print('Training on: \t' + str(labels) + '') print("Training with Batch Size: ", self.__train_batch_size) print("Number of Training Samples: ", len(train_ints)) print("Number of Validation Samples: ", len(valid_ints)) print("Number of Experiments: ", self.__train_epochs) ############################### # Create the generators ############################### train_generator = BatchGenerator( instances=train_ints, anchors=self.__model_anchors, labels=labels, downsample=32, # ratio between network input's size and network output's size, 32 for YOLOv3 max_box_per_image=max_box_per_image, batch_size=self.__train_batch_size, min_net_size=self.__model_min_input_size, max_net_size=self.__model_max_input_size, shuffle=True, jitter=0.3, norm=normalize ) valid_generator = BatchGenerator( instances=valid_ints, anchors=self.__model_anchors, labels=labels, downsample=32, # ratio between network input's size and network output's size, 32 for YOLOv3 max_box_per_image=max_box_per_image, batch_size=self.__train_batch_size, min_net_size=self.__model_min_input_size, max_net_size=self.__model_max_input_size, shuffle=True, jitter=0.0, norm=normalize ) ############################### # Create the model 
############################### if os.path.exists(self.__pre_trained_model): self.__train_warmup_epochs = 0 warmup_batches = self.__train_warmup_epochs * (self.__train_times * len(train_generator)) os.environ['CUDA_VISIBLE_DEVICES'] = self.__train_gpus multi_gpu = [int(gpu) for gpu in self.__train_gpus.split(',')] """train_model, infer_model = self._create_model( nb_class=len(labels), anchors=self.__model_anchors, max_box_per_image=max_box_per_image, max_grid=[self.__model_max_input_size, self.__model_max_input_size], batch_size=self.__train_batch_size, warmup_batches=warmup_batches, ignore_thresh=self.__train_ignore_treshold, multi_gpu=multi_gpu, lr=self.__train_learning_rate, grid_scales=self.__train_grid_scales, obj_scale=self.__train_obj_scale, noobj_scale=self.__train_noobj_scale, xywh_scale=self.__train_xywh_scale, class_scale=self.__train_class_scale, )""" train_model, infer_model = self._create_model( nb_class=len(labels), anchors=self.__model_anchors, max_box_per_image=max_box_per_image, max_grid=[self.__model_max_input_size, self.__model_max_input_size], batch_size=self.__train_batch_size, warmup_batches=warmup_batches, ignore_thresh=self.__train_ignore_treshold, multi_gpu=multi_gpu, lr=self.__train_learning_rate, grid_scales=self.__train_grid_scales, obj_scale=self.__train_obj_scale, noobj_scale=self.__train_noobj_scale, xywh_scale=self.__train_xywh_scale, class_scale=self.__train_class_scale, ) ############################### # Kick off the training ############################### callbacks = self._create_callbacks(self.__train_weights_name, infer_model) train_model.fit_generator( generator=train_generator, steps_per_epoch=len(train_generator) * self.__train_times, validation_data=valid_generator, validation_steps=len(valid_generator) * self.__train_times, epochs=self.__train_epochs + self.__train_warmup_epochs, verbose=1, callbacks=callbacks, workers=4, max_queue_size=8 ) def evaluateModel(self, model_path, json_path, batch_size=4, iou_threshold=0.5, 
object_threshold=0.2, nms_threshold=0.45): """ 'evaluateModel()' is used to obtain the mAP metrics for your model(s). It accepts the following values: - model_path ( model file or folder), this value can be the part to your model file or the path to the folder containing all your saved model files - json_path , this is the path the the 'detection_config.json' file saved for the dataset during the training - iou_threshold , this value is used to set the desired 'IoU' to obtain the mAP metrics for your model(s) - object_threshold , this is used to set your desired minimum 'class score' to obtain the mAP metrics for your model(s) - nms_threshold , this is used to set your desired 'Non-maximum suppresion' to obtain the mAP metrics for your model(s) :param model_path: :param json_path: :param batch_size: :param iou_threshold: :param object_threshold: :param nms_threshold: :return: list of dictionaries, containing one dict per evaluated model. Each dict contains exactly the same metrics that are printed on standard output """ self.__training_mode = False with open(json_path, 'r') as json_file: detection_model_json = json.load(json_file) temp_anchor_array = [] new_anchor_array = [] temp_anchor_array.append(detection_model_json["anchors"][2]) temp_anchor_array.append(detection_model_json["anchors"][1]) temp_anchor_array.append(detection_model_json["anchors"][0]) for aa in temp_anchor_array: for aaa in aa: new_anchor_array.append(aaa) self.__model_anchors = new_anchor_array self.__model_labels = detection_model_json["labels"] self.__num_objects = len(self.__model_labels) self.__train_batch_size = batch_size self.__train_epochs = 100 print("Starting Model evaluation....") _, valid_ints, labels, max_box_per_image = self._create_training_instances( self.__train_annotations_folder, self.__train_images_folder, self.__train_cache_file, self.__validation_annotations_folder, self.__validation_images_folder, self.__validation_cache_file, self.__model_labels ) if len(valid_ints) == 
0: print('Validation samples were not provided.') print('Please, check your validation samples are correctly provided:') print('\tAnnotations: {}\n\tImages: {}'.format(self.__validation_annotations_folder, self.__validation_images_folder)) valid_generator = BatchGenerator( instances=valid_ints, anchors=self.__model_anchors, labels=labels, downsample=32, # ratio between network input's size and network output's size, 32 for YOLOv3 max_box_per_image=max_box_per_image, batch_size=self.__train_batch_size, min_net_size=self.__model_min_input_size, max_net_size=self.__model_max_input_size, shuffle=True, jitter=0.0, norm=normalize ) results = list() if os.path.isfile(model_path): # model_files must be a list containing the complete path to the files, # if a file is given, then the list contains just this file model_files = [model_path] elif os.path.isdir(model_path): # model_files must be a list containing the complete path to the files, # if a folder is given, then the list contains the complete path to each file on that folder model_files = sorted([os.path.join(model_path, file_name) for file_name in os.listdir(model_path)]) # sort the files to make sure we're always evaluating them on same order else: print('model_path must be the path to a .h5 file or a directory. 
Found {}'.format(model_path)) return results for model_file in model_files: if str(model_file).endswith(".h5"): try: infer_model = load_model(model_file) ############################### # Run the evaluation ############################### # compute mAP for all the classes average_precisions = evaluate(infer_model, valid_generator, iou_threshold=iou_threshold, obj_thresh=object_threshold, nms_thresh=nms_threshold) result_dict = { 'model_file': model_file, 'using_iou': iou_threshold, 'using_object_threshold': object_threshold, 'using_non_maximum_suppression': nms_threshold, 'average_precision': dict(), 'evaluation_samples': len(valid_ints) } # print the score print("Model File: ", model_file, '\n') print("Evaluation samples: ", len(valid_ints)) print("Using IoU: ", iou_threshold) print("Using Object Threshold: ", object_threshold) print("Using Non-Maximum Suppression: ", nms_threshold) for label, average_precision in average_precisions.items(): print(labels[label] + ': {:.4f}'.format(average_precision)) result_dict['average_precision'][labels[label]] = average_precision print('mAP: {:.4f}'.format(sum(average_precisions.values()) / len(average_precisions))) result_dict['map'] = sum(average_precisions.values()) / len(average_precisions) print("===============================") results.append(result_dict) except Exception as e: print('skipping the evaluation of {} because following exception occurred: {}'.format(model_file, e)) continue else: print('skipping the evaluation of {} since it\'s not a .h5 file'.format(model_file)) return results def _create_training_instances(self, train_annot_folder, train_image_folder, train_cache, valid_annot_folder, valid_image_folder, valid_cache, labels, ): # parse annotations of the training set train_ints, train_labels = parse_voc_annotation(train_annot_folder, train_image_folder, train_cache, labels) # parse annotations of the validation set, if any, otherwise split the training set if os.path.exists(valid_annot_folder): valid_ints, 
valid_labels = parse_voc_annotation(valid_annot_folder, valid_image_folder, valid_cache, labels) print('Evaluating over {} samples taken from {}'.format(len(valid_ints), os.path.dirname(valid_annot_folder))) else: train_portion = 0.8 # use 80% to train and the remaining 20% to evaluate train_valid_split = int(round(train_portion * len(train_ints))) np.random.seed(0) np.random.shuffle(train_ints) valid_ints = train_ints[train_valid_split:] train_ints = train_ints[:train_valid_split] print('Evaluating over {} samples taken as {:5.2f}% of the training set ' 'given at {}'.format(len(valid_ints), (1 - train_portion)*100, os.path.dirname(train_annot_folder))) print('Training over {} samples given at {}'.format(len(train_ints), os.path.dirname(train_annot_folder))) # compare the seen labels with the given labels in config.json if len(labels) > 0: overlap_labels = set(labels).intersection(set(train_labels.keys())) # return None, None, None if some given label is not in the dataset if len(overlap_labels) < len(labels): if self.__training_mode: print('Some labels have no annotations! Please revise the list of labels in your configuration.') return None, None, None, None else: if self.__training_mode: print('No labels are provided. 
Train on all seen labels.') print(train_labels) labels = train_labels.keys() max_box_per_image = max([len(inst['object']) for inst in (train_ints + valid_ints)]) return train_ints, valid_ints, sorted(labels), max_box_per_image def _create_callbacks(self, saved_weights_name, model_to_save): checkpoint = CustomModelCheckpoint( model_to_save=model_to_save, filepath=saved_weights_name + 'ex-{epoch:03d}--loss-{loss:08.3f}.h5', monitor='loss', verbose=0, save_best_only=True, mode='min', period=1 ) reduce_on_plateau = ReduceLROnPlateau( monitor='loss', factor=0.1, patience=2, verbose=0, mode='min', epsilon=0.01, cooldown=0, min_lr=0 ) tensor_board = TensorBoard( log_dir=self.__logs_directory ) return [checkpoint, reduce_on_plateau, tensor_board] def _create_model( self, nb_class, anchors, max_box_per_image, max_grid, batch_size, warmup_batches, ignore_thresh, multi_gpu, lr, grid_scales, obj_scale, noobj_scale, xywh_scale, class_scale ): if len(multi_gpu) > 1: with tf.device('/cpu:0'): template_model, infer_model = yolov3_train( num_classes=nb_class, anchors=anchors, max_box_per_image=max_box_per_image, max_grid=max_grid, batch_size=batch_size // len(multi_gpu), warmup_batches=warmup_batches, ignore_thresh=ignore_thresh, grid_scales=grid_scales, obj_scale=obj_scale, noobj_scale=noobj_scale, xywh_scale=xywh_scale, class_scale=class_scale ) else: template_model, infer_model = yolov3_train( num_classes=nb_class, anchors=anchors, max_box_per_image=max_box_per_image, max_grid=max_grid, batch_size=batch_size, warmup_batches=warmup_batches, ignore_thresh=ignore_thresh, grid_scales=grid_scales, obj_scale=obj_scale, noobj_scale=noobj_scale, xywh_scale=xywh_scale, class_scale=class_scale ) # load the pretrained weight if exists, otherwise load the backend weight only if len(self.__pre_trained_model) > 3: if self.__training_mode: print("Training with transfer learning from pretrained Model") template_model.load_weights(self.__pre_trained_model, by_name=True) else: if 
class CustomObjectDetection:
    """
    This is the object detection class for using your custom trained models.
    It supports your custom trained YOLOv3 model and allows you to perform object detection in images.
    """

    def __init__(self):
        self.__model_type = ""
        self.__model_path = ""
        self.__model_labels = []
        self.__model_anchors = []
        self.__detection_config_json_path = ""
        self.__input_size = 416
        self.__object_threshold = 0.4
        self.__nms_threshold = 0.4
        self.__model = None
        self.__detection_utils = CustomDetectionUtils(labels=[])

    def setModelTypeAsYOLOv3(self):
        """
        'setModelTypeAsYOLOv3' is used to set your custom detection model as YOLOv3
        :return:
        """
        self.__model_type = "yolov3"

    def setModelPath(self, detection_model_path):
        """
        'setModelPath' is used to specify the filepath to your custom detection model
        :param detection_model_path: path to the .h5 model file. Usually is one of those under
            <data_directory>/models/detection_model-ex-ddd--loss-dddd.ddd.h5
        :return: None
        """
        self.__model_path = detection_model_path

    def setJsonPath(self, configuration_json):
        """
        'setJsonPath' is used to set the filepath to the configuration JSON file for your custom detection model
        :param configuration_json: path to the .json file. Usually it is <data_directory>/json/detection_config.json
        :return: None
        """
        self.__detection_config_json_path = configuration_json

    def loadModel(self):
        """
        'loadModel' is used to load the model into the CustomObjectDetection class.

        The labels and anchors are read from the configuration JSON and the weights from the model path.
        Nothing is loaded unless the model type has been set to YOLOv3 beforehand.
        :return: None
        """
        if self.__model_type == "yolov3":
            # the context manager closes the configuration file as soon as it is parsed
            with open(self.__detection_config_json_path) as json_file:
                detection_model_json = json.load(json_file)

            self.__model_labels = detection_model_json["labels"]
            self.__model_anchors = detection_model_json["anchors"]

            self.__detection_utils = CustomDetectionUtils(labels=self.__model_labels)

            self.__model = yolov3_main(Input(shape=(None, None, 3)), 3, len(self.__model_labels))
            self.__model.load_weights(self.__model_path)

    def detectObjectsFromImage(self, input_image="", output_image_path="", input_type="file", output_type="file",
                               extract_detected_objects=False, minimum_percentage_probability=50, nms_treshold=0.4,
                               display_percentage_probability=True, display_object_name=True, thread_safe=False):
        """
        'detectObjectsFromImage()' function is used to detect objects observable in the given image:
            * input_image , which can be a filepath or image numpy array in BGR
            * output_image_path (only if output_type = file) , file path to the output image that will contain
              the detection boxes and label. It must end with one of the supported image extensions
            * input_type (optional) , filepath/numpy array of the image. Acceptable values are "file" and "array"
            * output_type (optional) , file path/numpy array of the image. Acceptable values are "file" and
              "array"
            * extract_detected_objects (optional) , option to save each object detected individually as an
              image and return an array of the objects' image path.
            * minimum_percentage_probability (optional, 50 by default) , option to set the minimum percentage
              probability for nominating a detected object for output.
            * nms_treshold (optional, 0.4 by default) , option to set the Non-maximum suppression for the
              detection
            * display_percentage_probability (optional, True by default), option to show or hide the percentage
              probability of each object in the saved/returned detected image
            * display_object_name (optional, True by default), option to show or hide the name of each object
              in the saved/returned detected image
            * thread_safe (optional, False by default), run the inference on the graph of the Keras backend
              session, when the backend provides one

        Every detected object is described by a dictionary with the following properties:
            * name (string)
            * percentage_probability (float)
            * box_points (list of x1,y1,x2 and y2 coordinates)

        The values returned by this function depend on the parameters passed:

            - extract_detected_objects = False and output_type = 'file': the detected image is written to
              'output_image_path' and the function returns
                1. an array of the dictionaries of the detected objects

            - extract_detected_objects = False and output_type = 'array': the function returns
                1. a numpy array of the detected image
                2. an array of the dictionaries of the detected objects

            - extract_detected_objects = True and output_type = 'file': the detected image is written to
              'output_image_path', every object is written into the '<output image path without
              extension>-objects' folder and the function returns
                1. an array of the dictionaries of the detected objects
                2. an array of string paths to the image of each object extracted from the image

            - extract_detected_objects = True and output_type = 'array': the function returns
                1. a numpy array of the detected image
                2. an array of the dictionaries of the detected objects
                3. an array of numpy arrays of each object detected in the image

        :param input_image:
        :param output_image_path:
        :param input_type:
        :param output_type:
        :param extract_detected_objects:
        :param minimum_percentage_probability:
        :param nms_treshold:
        :param display_percentage_probability:
        :param display_object_name:
        :param thread_safe:
        :return image_frame:
        :return output_objects_array:
        :return detected_objects_image_array:
        :raises ValueError: if the model is not loaded, if input_type or output_type is not supported,
            if output_image_path has no supported extension or if the input image file cannot be read
        """
        if self.__model is None:
            raise ValueError("You must call the loadModel() function before making object detection.")

        # fail early instead of running the whole detection and returning nothing
        if output_type not in ("file", "array"):
            raise ValueError("output_type must be 'file' or 'array'. {} found".format(output_type))

        if output_type == "file":
            # from the image file, lets keep the directory and the filename, but remove its format
            # if output_image_path is path/to/the/output/image.png
            # then output_image_folder is path/to/the/output/image
            # let's check if it is in the appropriated format soon to fail early
            output_image_folder, n_subs = re.subn(r'\.(?:jpe?g|png|tif|webp|PPM|PGM)$', '', output_image_path,
                                                  flags=re.I)
            if n_subs == 0:
                # if no substitution was done, the given output_image_path is not in a supported format,
                # raise an error
                raise ValueError("output_image_path must be the path where to write the image. "
                                 "Therefore it must end as one the following: "
                                 "'.jpg', '.png', '.tif', '.webp', '.PPM', '.PGM'. {} found".format(output_image_path))
            elif extract_detected_objects:
                # Results must be written as files and need to extract detected objects as images,
                # let's create a folder to store the object's images
                objects_dir = output_image_folder + "-objects"
                os.makedirs(objects_dir, exist_ok=True)

        self.__object_threshold = minimum_percentage_probability / 100
        self.__nms_threshold = nms_treshold

        output_objects_array = []
        detected_objects_image_array = []

        if input_type == "file":
            image = cv2.imread(input_image)
            if image is None:
                # cv2.imread() signals an unreadable or missing file by returning None
                raise ValueError("input_image could not be read as an image. {} found".format(input_image))
        elif input_type == "array":
            image = input_image
        else:
            raise ValueError("input_type must be 'file' or 'array'. {} found".format(input_type))

        image_frame = image.copy()

        height, width, channels = image.shape

        image = cv2.resize(image, (self.__input_size, self.__input_size))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = image.astype("float32") / 255.

        # expand the image to batch
        image = np.expand_dims(image, 0)

        if self.__model_type == "yolov3":
            # NOTE(review): 'get_session' is not exported by every Keras backend (presumably only
            # tf.compat.v1 provides it on TensorFlow 2.x, confirm); fall back to a plain predict() then.
            get_session = getattr(K, "get_session", None)
            if thread_safe == True and get_session is not None:
                with get_session().graph.as_default():
                    yolo_results = self.__model.predict(image)
            else:
                yolo_results = self.__model.predict(image)

            boxes = list()

            for idx, result in enumerate(yolo_results):
                box_set = self.__detection_utils.decode_netout(result[0], self.__model_anchors[idx],
                                                               self.__object_threshold, self.__input_size,
                                                               self.__input_size)
                boxes += box_set

            self.__detection_utils.correct_yolo_boxes(boxes, height, width, self.__input_size, self.__input_size)

            self.__detection_utils.do_nms(boxes, self.__nms_threshold)

            all_boxes, all_labels, all_scores = self.__detection_utils.get_boxes(boxes, self.__model_labels,
                                                                                 self.__object_threshold)

            for object_box, object_label, object_score in zip(all_boxes, all_labels, all_scores):
                each_object_details = dict()
                each_object_details["name"] = object_label
                each_object_details["percentage_probability"] = object_score

                # clamp the top-left corner to the image; the boxes are also used for drawing below
                if object_box.xmin < 0:
                    object_box.xmin = 0
                if object_box.ymin < 0:
                    object_box.ymin = 0

                each_object_details["box_points"] = [object_box.xmin, object_box.ymin,
                                                     object_box.xmax, object_box.ymax]
                output_objects_array.append(each_object_details)

            drawn_image = self.__detection_utils.draw_boxes_and_caption(image_frame.copy(), all_boxes, all_labels,
                                                                        all_scores, show_names=display_object_name,
                                                                        show_percentage=display_percentage_probability)

            if extract_detected_objects:
                for cnt, each_object in enumerate(output_objects_array):
                    x1, y1, x2, y2 = each_object["box_points"]
                    splitted_image = image_frame[y1:y2, x1:x2]
                    if output_type == "file":
                        splitted_image_path = os.path.join(objects_dir,
                                                           "{}-{:05d}.jpg".format(each_object["name"], cnt))
                        cv2.imwrite(splitted_image_path, splitted_image)
                        detected_objects_image_array.append(splitted_image_path)
                    elif output_type == "array":
                        detected_objects_image_array.append(splitted_image.copy())

            if output_type == "file":
                # we already validated that the output_image_path is a supported by OpenCV one
                cv2.imwrite(output_image_path, drawn_image)

            if extract_detected_objects:
                if output_type == "file":
                    return output_objects_array, detected_objects_image_array
                elif output_type == "array":
                    return drawn_image, output_objects_array, detected_objects_image_array
            else:
                if output_type == "file":
                    return output_objects_array
                elif output_type == "array":
                    return drawn_image, output_objects_array
def detectObjectsFromVideo(self, input_file_path="", camera_input=None, output_file_path="", frames_per_second=20,
                           frame_detection_interval=1, minimum_percentage_probability=50, log_progress=False,
                           display_percentage_probability=True, display_object_name=True,
                           save_detected_video=True, per_frame_function=None, per_second_function=None,
                           per_minute_function=None, video_complete_function=None, return_detected_frame=False,
                           detection_timeout=None):
    """
    Detect objects in the frames of a video file or of a camera feed.

    Frames are read one by one, passed to the loaded detector and optionally
    written to an MJPG '.avi' file. Optional callbacks receive the results per
    frame, per second, per minute and once the whole video has been processed.

    * input_file_path , path to the input video; only used when 'camera_input' is not set
    * camera_input , an already opened capture object (it must provide 'isOpened', 'read', 'get'
        and 'release'); when given it is used instead of 'input_file_path'
    * output_file_path , path of the output video WITHOUT extension ('.avi' is appended);
        only used when 'save_detected_video' is true
    * frames_per_second , frame rate of the output video; it also defines how many frames make
        up one "second" for 'per_second_function', 'per_minute_function' and 'detection_timeout'
    * frame_detection_interval (1 by default) , detection runs on the first frame and on every
        frame whose position is a multiple of this value; other frames are passed through
        with an empty detection list
    * minimum_percentage_probability (50 by default) , forwarded to the detector
    * log_progress , print the position of every processed frame
    * display_percentage_probability , forwarded to the detector
    * display_object_name , forwarded to the detector
    * save_detected_video (True by default) , write the processed frames to the output video
    * per_frame_function , called after every frame on which detection ran with:
        -- the position number of the frame
        -- the list of detection dictionaries returned by the detector for that frame
        -- a dictionary mapping each detected object name to its number of instances
        -- the detected frame, only if 'return_detected_frame' is true
    * per_second_function , called every 'frames_per_second' frames with:
        -- the position number of the second
        -- a list holding, for each frame of the past second, its list of detections
        -- a list holding, for each frame of the past second, its name -> count dictionary
        -- a dictionary mapping each object name to its average count per frame over the
           past second, truncated to an integer
        -- the last detected frame, only if 'return_detected_frame' is true
    * per_minute_function , same as 'per_second_function' but for every
        'frames_per_second * 60' frames
    * video_complete_function , called once after the last frame with the per-frame detection
        lists, the per-frame count dictionaries and the average count per frame of every
        object name over the whole video (not truncated)
    * return_detected_frame (False by default) , append the detected frame to the arguments of
        the per frame/second/minute callbacks
    * detection_timeout (None by default) , number of seconds of video after which processing stops

    :return: the path of the written video ('output_file_path' + '.avi') when
        'save_detected_video' is true, otherwise None. None is also returned, without
        reading any frame, when the model type has not been set to YOLOv3.
    """
    # Only the YOLOv3 pipeline is implemented; bail out before any capture or
    # writer is opened so nothing is leaked and no empty output file is created.
    if self.__model_type != "yolov3":
        return None

    frame_objects = {}  # frame position -> list of detection dictionaries
    frame_counts = {}   # frame position -> {object name: instances}

    def count_names(detections):
        """Return {object name: number of instances} for one frame."""
        counts = {}
        for detection in detections:
            name = detection["name"]
            counts[name] = counts.get(name, 0) + 1
        return counts

    def summarise(last_frame, window):
        """Collect the per-frame results of the 'window' frames ending at 'last_frame'."""
        positions = range(max(1, last_frame - window + 1), last_frame + 1)
        objects = [frame_objects[position] for position in positions]
        counts = [frame_counts[position] for position in positions]
        totals = {}
        for frame_count in counts:
            for name, amount in frame_count.items():
                totals[name] = totals.get(name, 0) + amount
        return objects, counts, totals

    def notify(callback, frame_image, *values):
        """Invoke a callback, appending the frame when the caller asked for it."""
        if return_detected_frame:
            callback(*values, frame_image)
        else:
            callback(*values)

    output_video_filepath = output_file_path + '.avi'
    frames_per_minute = frames_per_second * 60

    # Open the file capture only when no camera was supplied; the previous code
    # always opened one and dropped it without releasing it.
    if camera_input is not None:
        input_video = camera_input
    else:
        input_video = cv2.VideoCapture(input_file_path)

    output_video = None
    counting = 0
    elapsed_seconds = 0

    try:
        if save_detected_video:
            frame_width = int(input_video.get(cv2.CAP_PROP_FRAME_WIDTH))
            frame_height = int(input_video.get(cv2.CAP_PROP_FRAME_HEIGHT))
            output_video = cv2.VideoWriter(output_video_filepath, cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'),
                                           frames_per_second, (frame_width, frame_height))

        while input_video.isOpened():
            ret, frame = input_video.read()
            if not ret:
                break

            detected_frame = frame.copy()

            if detection_timeout is not None:
                # The frame just read is frame number 'counting + 1'.
                if (counting + 1) % frames_per_second == 0:
                    elapsed_seconds += 1
                if elapsed_seconds >= detection_timeout:
                    break

            counting += 1
            if log_progress:
                print("Processing Frame : ", str(counting))

            check_frame_interval = counting % frame_detection_interval
            run_detection = counting == 1 or check_frame_interval == 0

            output_objects_array = []
            if run_detection:
                try:
                    detected_frame, output_objects_array = self.__detector.detectObjectsFromImage(
                        input_image=frame, input_type="array", output_type="array",
                        minimum_percentage_probability=minimum_percentage_probability,
                        display_percentage_probability=display_percentage_probability,
                        display_object_name=display_object_name)
                except Exception as error:
                    # Best effort: a failing frame is kept unannotated so one bad
                    # frame does not abort the whole video. KeyboardInterrupt and
                    # SystemExit are no longer swallowed.
                    if log_progress:
                        print("Detection failed on frame", counting, ":", error)

            output_objects_count = count_names(output_objects_array)
            frame_objects[counting] = output_objects_array
            frame_counts[counting] = output_objects_count

            if save_detected_video:
                output_video.write(detected_frame)

            if run_detection and per_frame_function is not None:
                notify(per_frame_function, detected_frame, counting, output_objects_array,
                       output_objects_count)

            if per_second_function is not None and counting != 1 and counting % frames_per_second == 0:
                objects, counts, totals = summarise(counting, frames_per_second)
                averages = {name: int(total / frames_per_second) for name, total in totals.items()}
                notify(per_second_function, detected_frame, int(counting / frames_per_second),
                       objects, counts, averages)

            if per_minute_function is not None and counting != 1 and counting % frames_per_minute == 0:
                objects, counts, totals = summarise(counting, frames_per_minute)
                averages = {name: int(total / frames_per_minute) for name, total in totals.items()}
                notify(per_minute_function, detected_frame, int(counting / frames_per_minute),
                       objects, counts, averages)

        if video_complete_function is not None:
            objects, counts, totals = summarise(counting, counting)
            averages = {name: total / counting for name, total in totals.items()}
            video_complete_function(objects, counts, averages)
    finally:
        # Release the capture and writer even if detection or a callback raised.
        input_video.release()
        if output_video is not None:
            output_video.release()

    if save_detected_video:
        return output_video_filepath
    return None
class CustomDetectionUtils:
    """
    Post-processing helpers for custom YOLOv3 detections: decoding raw network
    output into boxes, non-maximum suppression, threshold filtering and drawing.
    """

    def __init__(self, labels):
        """
        :param labels: sequence of class names; one random drawing colour is
            generated per label
        """
        self.__labels = labels
        self.__colors = []

        for _ in range(len(labels)):
            red, green, blue = np.random.randint(50, 255, size=(3,))
            self.__colors.append([int(red), int(green), int(blue)])

    @staticmethod
    def _sigmoid(x):
        """Return the element-wise logistic sigmoid of x."""
        return 1. / (1. + np.exp(-x))

    def decode_netout(self, netout, anchors, obj_thresh, net_h, net_w):
        """
        Turn one raw YOLO output grid into a list of BoundBox objects.

        :param netout: array whose first two axes are the grid rows and columns and
            whose remaining values hold, for each of the 3 anchors of a cell,
            x, y, w, h, objectness and the class scores
        :param anchors: flat sequence of anchor sizes (width, height, width, height, ...)
        :param obj_thresh: boxes whose objectness is not above this value are dropped and
            class scores not above it are zeroed
        :param net_h: network input height, used to normalise box heights
        :param net_w: network input width, used to normalise box widths
        :return: list of BoundBox with coordinates expressed as fractions of the image
        """
        grid_h, grid_w = netout.shape[:2]
        nb_box = 3

        # reshape() returns a view: work on a copy so the activations below do not
        # overwrite the caller's array (decoding the same output twice used to
        # apply the sigmoid twice).
        netout = np.array(netout.reshape((grid_h, grid_w, nb_box, -1)))

        netout[..., :2] = self._sigmoid(netout[..., :2])
        netout[..., 4:] = self._sigmoid(netout[..., 4:])
        # class score = objectness * class probability, zeroed when not above the threshold
        netout[..., 5:] = netout[..., 4][..., np.newaxis] * netout[..., 5:]
        netout[..., 5:] *= netout[..., 5:] > obj_thresh

        boxes = []
        # Visit only the cells/anchors that pass the objectness test. argwhere
        # yields indices in row, column, anchor order, i.e. the same order as the
        # nested loops it replaces; tolist() gives plain Python ints.
        kept = np.argwhere(~(netout[..., 4] <= obj_thresh)).tolist()
        for row, col, b in kept:
            # 4th element is objectness score
            objectness = netout[row, col, b, 4]

            # first 4 elements are x, y, w, and h
            x, y, w, h = netout[row, col, b, :4]
            x = (col + x) / grid_w  # center position, unit: image width
            y = (row + y) / grid_h  # center position, unit: image height
            w = anchors[2 * b + 0] * np.exp(w) / net_w  # unit: image width
            h = anchors[2 * b + 1] * np.exp(h) / net_h  # unit: image height

            # last elements are class probabilities
            classes = netout[row, col, b, 5:]
            boxes.append(BoundBox(x - w / 2, y - h / 2, x + w / 2, y + h / 2, objectness, classes))

        return boxes

    @staticmethod
    def correct_yolo_boxes(boxes, image_h, image_w, net_h, net_w):
        """
        Convert box coordinates, in place, from image fractions to integer pixels.

        The network input is treated as the whole image stretched to the network
        size (no letterbox padding), so the offset is 0 and the scale is 1 and the
        conversion reduces to a multiplication by the image size. 'net_h' and
        'net_w' are kept for interface compatibility.
        """
        for box in boxes:
            box.xmin = int(box.xmin * image_w)
            box.xmax = int(box.xmax * image_w)
            box.ymin = int(box.ymin * image_h)
            box.ymax = int(box.ymax * image_h)

    def _interval_overlap(self, interval_a, interval_b):
        """Return the length of the overlap of two 1-D intervals (0 when disjoint)."""
        x1, x2 = interval_a
        x3, x4 = interval_b

        if x3 < x1:
            if x4 < x1:
                return 0
            return min(x2, x4) - x1

        if x2 < x3:
            return 0
        return min(x2, x4) - x3

    def bbox_iou(self, box1, box2):
        """Return the intersection over union of two boxes, 0.0 when the union is empty."""
        intersect_w = self._interval_overlap([box1.xmin, box1.xmax], [box2.xmin, box2.xmax])
        intersect_h = self._interval_overlap([box1.ymin, box1.ymax], [box2.ymin, box2.ymax])
        intersect = intersect_w * intersect_h

        w1, h1 = box1.xmax - box1.xmin, box1.ymax - box1.ymin
        w2, h2 = box2.xmax - box2.xmin, box2.ymax - box2.ymin
        union = w1 * h1 + w2 * h2 - intersect

        # Explicit guard instead of a bare 'except' around the division.
        if union == 0:
            return 0.0
        return float(intersect) / float(union)

    def do_nms(self, boxes, nms_thresh):
        """
        Per-class non-maximum suppression, performed in place.

        For every class the boxes are visited by decreasing score; the class score
        of any lower-ranked box overlapping a kept box with IoU >= nms_thresh is
        set to 0. Nothing is returned.
        """
        if len(boxes) == 0:
            return

        nb_class = len(boxes[0].classes)

        for c in range(nb_class):
            sorted_indices = np.argsort([-box.classes[c] for box in boxes])

            for i in range(len(sorted_indices)):
                index_i = sorted_indices[i]

                if boxes[index_i].classes[c] == 0:
                    continue

                for j in range(i + 1, len(sorted_indices)):
                    index_j = sorted_indices[j]

                    if self.bbox_iou(boxes[index_i], boxes[index_j]) >= nms_thresh:
                        boxes[index_j].classes[c] = 0

    def get_boxes(self, boxes, labels, thresh):
        """
        Select the (box, label) pairs whose class score is above 'thresh'.

        :return: three parallel lists: boxes, label names and scores as percentages.
            A box appears once per label that passes the threshold.
        """
        v_boxes, v_labels, v_scores = [], [], []

        for box in boxes:
            for i in range(len(labels)):
                if box.classes[i] > thresh:
                    v_boxes.append(box)
                    v_labels.append(labels[i])
                    v_scores.append(box.classes[i] * 100)
                # don't break, many labels may trigger for one box

        return v_boxes, v_labels, v_scores

    def label_color(self, label):
        """
        Return the drawing colour of a label index.

        Args
            label: index of the label in the list given to the constructor.

        Returns
            The [red, green, blue] list generated for that label, or the tuple
            (0, 255, 0) when the index has no generated colour.
        """
        if label < len(self.__colors):
            return self.__colors[label]
        return 0, 255, 0

    def draw_boxes_and_caption(self, image_frame, v_boxes, v_labels, v_scores, show_names=False,
                               show_percentage=False):
        """
        Draw every box on 'image_frame' with an optional "name : score" caption.

        :param v_boxes: boxes with integer pixel coordinates
        :param v_labels: label name of each box; must be one of the constructor labels
        :param v_scores: score of each box, printed with three decimals
        :return: the annotated frame
        """
        for i in range(len(v_boxes)):
            box = v_boxes[i]
            y1, x1, y2, x2 = box.ymin, box.xmin, box.ymax, box.xmax
            class_color = self.label_color(self.__labels.index(v_labels[i]))

            image_frame = cv2.rectangle(image_frame, (x1, y1), (x2, y2), class_color, 2)

            label = ""
            if show_names and show_percentage:
                label = "%s : %.3f" % (v_labels[i], v_scores[i])
            elif show_names:
                label = "%s" % (v_labels[i])
            elif show_percentage:
                label = "%.3f" % (v_scores[i])

            if show_names or show_percentage:
                b = np.array([x1, y1, x2, y2]).astype(int)
                # thick dark text first, thinner white text on top: outlined caption
                cv2.putText(image_frame, label, (b[0], b[1] - 10), cv2.FONT_HERSHEY_PLAIN, 1, (200, 0, 0), 3)
                cv2.putText(image_frame, label, (b[0], b[1] - 10), cv2.FONT_HERSHEY_PLAIN, 1, (255, 255, 255), 2)

        return image_frame
class CustomModelCheckpoint(ModelCheckpoint):
    """
    Checkpoint callback that saves a given model ('model_to_save', the template
    model) instead of the model being trained (the multi-GPU model).
    """

    def __init__(self, model_to_save, **kwargs):
        super(CustomModelCheckpoint, self).__init__(**kwargs)
        self.model_to_save = model_to_save

    def on_epoch_end(self, epoch, logs=None):
        """
        Save 'model_to_save' every 'period' epochs, honouring 'save_best_only',
        'save_weights_only' and 'verbose' of the parent callback.
        """
        logs = logs or {}
        self.epochs_since_last_save += 1

        if self.epochs_since_last_save >= self.period:
            self.epochs_since_last_save = 0
            filepath = self.filepath.format(epoch=epoch + 1, **logs)
            must_save = False

            if not self.save_best_only:
                # unconditional checkpoint
                if self.verbose > 0:
                    print('\nEpoch %05d: saving model to %s' % (epoch + 1, filepath))
                must_save = True
            else:
                current = logs.get(self.monitor)
                if current is None:
                    warnings.warn('Can save best model only with %s available, '
                                  'skipping.' % (self.monitor), RuntimeWarning)
                elif self.monitor_op(current, self.best):
                    if self.verbose > 0:
                        print('\nEpoch %05d: %s improved from %0.5f to %0.5f,'
                              ' saving model to %s'
                              % (epoch + 1, self.monitor, self.best,
                                 current, filepath))
                    self.best = current
                    must_save = True
                elif self.verbose > 0:
                    print('\nEpoch %05d: %s did not improve from %0.5f' %
                          (epoch + 1, self.monitor, self.best))

            if must_save:
                if self.save_weights_only:
                    self.model_to_save.save_weights(filepath, overwrite=True)
                else:
                    self.model_to_save.save(filepath, overwrite=True)

        # NOTE(review): the parent's batch hook (not its epoch hook) is invoked
        # here, exactly as in the original code.
        super(CustomModelCheckpoint, self).on_batch_end(epoch, logs)
def _main_(args):
    """
    Evaluate a saved YOLOv3 model on the validation set described by a JSON
    configuration file and print the average precision of every class and the mAP.

    :param args: parsed command line arguments; 'args.conf' is the path to the
        configuration file
    """
    with open(args.conf) as config_buffer:
        config = json.load(config_buffer)

    model_config = config['model']
    train_config = config['train']
    valid_config = config['valid']

    ###############################
    #   Create the validation generator
    ###############################
    valid_ints, labels = parse_voc_annotation(
        valid_config['valid_annot_folder'],
        valid_config['valid_image_folder'],
        valid_config['cache_name'],
        model_config['labels']
    )

    # Use the labels found in the annotations when the configuration lists none.
    labels = labels.keys() if len(model_config['labels']) == 0 else model_config['labels']
    labels = sorted(labels)

    valid_generator = BatchGenerator(
        instances=valid_ints,
        anchors=model_config['anchors'],
        labels=labels,
        downsample=32,  # ratio between network input's size and network output's size, 32 for YOLOv3
        max_box_per_image=0,
        batch_size=train_config['batch_size'],
        min_net_size=model_config['min_input_size'],
        max_net_size=model_config['max_input_size'],
        shuffle=True,
        jitter=0.0,
        norm=normalize
    )

    ###############################
    #   Load the model and do evaluation
    ###############################
    # Environment values must be strings; str() also accepts a numeric GPU id.
    os.environ['CUDA_VISIBLE_DEVICES'] = str(train_config['gpus'])

    infer_model = load_model(train_config['saved_weights_name'])

    # compute mAP for all the classes
    average_precisions = evaluate(infer_model, valid_generator)

    # Without any evaluated class the mean below would divide by zero.
    if not average_precisions:
        print('mAP: n/a (no classes were evaluated)')
        return

    # print the score
    for label, average_precision in average_precisions.items():
        print(labels[label] + ': {:.4f}'.format(average_precision))
    print('mAP: {:.4f}'.format(sum(average_precisions.values()) / len(average_precisions)))
def IOU(ann, centroids):
    """
    Return the IoU between one box size and every centroid size.

    Boxes are compared as if they shared the same corner, so only widths and
    heights matter.

    :param ann: (width, height) of the annotation box
    :param centroids: iterable of (width, height) centroids
    :return: array of shape (k,) with one similarity per centroid
    """
    w, h = ann
    similarities = []

    for c_w, c_h in centroids:
        if c_w >= w and c_h >= h:
            # the annotation fits inside the centroid
            similarity = w * h / (c_w * c_h)
        elif c_w >= w and c_h <= h:
            similarity = w * c_h / (w * h + (c_w - w) * c_h)
        elif c_w <= w and c_h >= h:
            similarity = c_w * h / (w * h + c_w * (c_h - h))
        else:
            # both w and h are bigger than c_w and c_h respectively
            similarity = (c_w * c_h) / (w * h)
        similarities.append(similarity)

    return np.array(similarities)


def avg_IOU(anns, centroids):
    """
    Return the mean, over all annotations, of the best IoU with any centroid.

    :param anns: array of shape (n, 2) of box sizes
    :param centroids: array of (width, height) centroids
    """
    n, _ = anns.shape
    total = 0.

    for i in range(n):
        total += max(IOU(anns[i], centroids))

    return total / n


def run_kmeans(ann_dims, anchor_num):
    """
    Cluster box sizes with k-means using 1 - IoU as distance.

    The initial centroids are annotations drawn at random (with replacement)
    through the 'random' module. Iteration stops when the assignment of the
    annotations to the centroids no longer changes.

    :param ann_dims: array of shape (n, d) of box sizes (d is 2 for width, height)
    :param anchor_num: number of centroids to compute
    :return: array of shape (anchor_num, d) with the centroids
    :raises ValueError: if 'ann_dims' holds no annotation
    """
    ann_num = ann_dims.shape[0]
    if ann_num == 0:
        raise ValueError("run_kmeans needs at least one annotation")
    anchor_dim = ann_dims.shape[1]

    prev_assignments = np.ones(ann_num) * (-1)
    indices = [random.randrange(ann_num) for _ in range(anchor_num)]
    centroids = ann_dims[indices]  # fancy indexing: a copy, safe to update in place

    while True:
        # distances.shape = (ann_num, anchor_num)
        distances = np.array([1 - IOU(ann_dims[i], centroids) for i in range(ann_num)])

        # assign samples to centroids
        assignments = np.argmin(distances, axis=1)

        if (assignments == prev_assignments).all():
            return centroids

        # calculate new centroids; the builtin float replaces the 'np.float'
        # alias, which no longer exists in NumPy >= 1.24
        centroid_sums = np.zeros((anchor_num, anchor_dim), float)
        for i in range(ann_num):
            centroid_sums[assignments[i]] += ann_dims[i]
        for j in range(anchor_num):
            # the epsilon keeps an empty cluster from dividing by zero
            centroids[j] = centroid_sums[j] / (np.sum(assignments == j) + 1e-6)

        prev_assignments = assignments.copy()
class BatchGenerator(Sequence):
    """
    Keras Sequence producing augmented training batches for a three-scale YOLOv3
    model: input images, ground truth boxes and the desired output of each scale.
    """

    def __init__(self,
                 instances,
                 anchors,
                 labels,
                 downsample=32,  # ratio between network input's size and network output's size, 32 for YOLOv3
                 max_box_per_image=30,
                 batch_size=1,
                 min_net_size=320,
                 max_net_size=608,
                 shuffle=True,
                 jitter=True,
                 norm=None
                 ):
        """
        :param instances: list of annotation dictionaries; each provides 'filename' and
            'object', a list of dictionaries with 'name', 'xmin', 'ymin', 'xmax', 'ymax'
        :param anchors: flat list of anchor sizes (width, height, width, height, ...)
        :param labels: list of class names
        :param downsample: ratio between the network input size and its coarsest output grid
        :param max_box_per_image: number of ground truth slots per image in the batch
        :param batch_size: number of images per batch
        :param min_net_size: smallest network input size, rounded down to a multiple of 'downsample'
        :param max_net_size: largest network input size, rounded down to a multiple of 'downsample'
        :param shuffle: shuffle the instances now and after every epoch
        :param jitter: factor applied to the image width/height to bound the random
            aspect ratio change
        :param norm: callable applied to each image before it is stored in the batch;
            when None the raw image is stored with its boxes drawn on it
        """
        self.instances = instances
        self.batch_size = batch_size
        self.labels = labels
        self.downsample = downsample
        self.max_box_per_image = max_box_per_image
        self.min_net_size = (min_net_size // self.downsample) * self.downsample
        self.max_net_size = (max_net_size // self.downsample) * self.downsample
        self.shuffle = shuffle
        self.jitter = jitter
        self.norm = norm
        self.anchors = [BoundBox(0, 0, anchors[2 * i], anchors[2 * i + 1]) for i in range(len(anchors) // 2)]
        self.net_h = 416
        self.net_w = 416

        if shuffle:
            np.random.shuffle(self.instances)

    def __len__(self):
        """Return the number of batches per epoch."""
        return int(np.ceil(float(len(self.instances)) / self.batch_size))

    def __getitem__(self, idx):
        """
        Build batch number 'idx'.

        :return: ([x_batch, t_batch, yolo_1, yolo_2, yolo_3],
                  [dummy_yolo_1, dummy_yolo_2, dummy_yolo_3])
        """
        # get image input size, change every 10 batches
        net_h, net_w = self._get_net_size(idx)
        base_grid_h, base_grid_w = net_h // self.downsample, net_w // self.downsample

        # determine the first and the last indices of the batch
        l_bound = idx * self.batch_size
        r_bound = (idx + 1) * self.batch_size

        if r_bound > len(self.instances):
            # last batch: shift the window back so it still holds batch_size items
            r_bound = len(self.instances)
            l_bound = r_bound - self.batch_size

        batch_len = r_bound - l_bound
        anchors_per_scale = len(self.anchors) // 3
        output_depth = 4 + 1 + len(self.labels)

        x_batch = np.zeros((batch_len, net_h, net_w, 3))  # input images
        t_batch = np.zeros((batch_len, 1, 1, 1, self.max_box_per_image, 4))  # list of groundtruth boxes

        # initialize the inputs and the outputs
        yolo_1 = np.zeros((batch_len, 1 * base_grid_h, 1 * base_grid_w, anchors_per_scale, output_depth))  # desired network output 1
        yolo_2 = np.zeros((batch_len, 2 * base_grid_h, 2 * base_grid_w, anchors_per_scale, output_depth))  # desired network output 2
        yolo_3 = np.zeros((batch_len, 4 * base_grid_h, 4 * base_grid_w, anchors_per_scale, output_depth))  # desired network output 3
        # anchor index // 3 selects the scale: the first anchors go to the finest grid
        yolos = [yolo_3, yolo_2, yolo_1]

        dummy_yolo_1 = np.zeros((batch_len, 1))
        dummy_yolo_2 = np.zeros_like(dummy_yolo_1)
        dummy_yolo_3 = np.zeros_like(dummy_yolo_1)

        instance_count = 0
        true_box_index = 0

        # do the logic to fill in the inputs and the output
        for train_instance in self.instances[l_bound:r_bound]:
            # augment input image and fix object's position and size
            img, all_objs = self._aug_image(train_instance, net_h, net_w)

            for obj in all_objs:
                # find the best anchor box for this object
                max_anchor = None
                max_index = -1
                max_iou = -1

                shifted_box = BoundBox(0, 0, obj['xmax'] - obj['xmin'], obj['ymax'] - obj['ymin'])

                for i, anchor in enumerate(self.anchors):
                    iou = bbox_iou(shifted_box, anchor)

                    if max_iou < iou:
                        max_anchor = anchor
                        max_index = i
                        max_iou = iou

                # determine the yolo to be responsible for this bounding box
                yolo = yolos[max_index // 3]
                grid_h, grid_w = yolo.shape[1:3]

                # determine the position of the bounding box on the grid
                center_x = .5 * (obj['xmin'] + obj['xmax'])
                center_x = center_x / float(net_w) * grid_w  # sigma(t_x) + c_x
                center_y = .5 * (obj['ymin'] + obj['ymax'])
                center_y = center_y / float(net_h) * grid_h  # sigma(t_y) + c_y

                # determine the sizes of the bounding box
                w = np.log((obj['xmax'] - obj['xmin']) / float(max_anchor.xmax))  # t_w
                h = np.log((obj['ymax'] - obj['ymin']) / float(max_anchor.ymax))  # t_h

                box = [center_x, center_y, w, h]

                # determine the index of the label
                obj_indx = self.labels.index(obj['name'])

                # determine the location of the cell responsible for this object
                grid_x = int(np.floor(center_x))
                grid_y = int(np.floor(center_y))

                # assign ground truth x, y, w, h, confidence and class probs to y_batch
                yolo[instance_count, grid_y, grid_x, max_index % 3] = 0
                yolo[instance_count, grid_y, grid_x, max_index % 3, 0:4] = box
                yolo[instance_count, grid_y, grid_x, max_index % 3, 4] = 1.
                yolo[instance_count, grid_y, grid_x, max_index % 3, 5 + obj_indx] = 1

                # assign the true box to t_batch
                true_box = [center_x, center_y, obj['xmax'] - obj['xmin'], obj['ymax'] - obj['ymin']]
                t_batch[instance_count, 0, 0, 0, true_box_index] = true_box

                # the slots are reused cyclically once max_box_per_image is reached
                true_box_index += 1
                true_box_index = true_box_index % self.max_box_per_image

            # assign input image to x_batch
            if self.norm is not None:
                x_batch[instance_count] = self.norm(img)
            else:
                # plot image and bounding boxes for sanity check
                for obj in all_objs:
                    cv2.rectangle(img, (obj['xmin'], obj['ymin']), (obj['xmax'], obj['ymax']), (255, 0, 0), 3)
                    cv2.putText(img, obj['name'],
                                (obj['xmin'] + 2, obj['ymin'] + 12),
                                0, 1.2e-3 * img.shape[0],
                                (0, 255, 0), 2)

                x_batch[instance_count] = img

            # increase instance counter in the current batch
            instance_count += 1

        return [x_batch, t_batch, yolo_1, yolo_2, yolo_3], [dummy_yolo_1, dummy_yolo_2, dummy_yolo_3]

    def _get_net_size(self, idx):
        """Return the (height, width) of the network input, redrawn at random every 10 batches."""
        if idx % 10 == 0:
            # integer bounds: both sizes are multiples of 'downsample'
            net_size = self.downsample * np.random.randint(self.min_net_size // self.downsample,
                                                           self.max_net_size // self.downsample + 1)
            self.net_h, self.net_w = net_size, net_size
        return self.net_h, self.net_w

    def _aug_image(self, instance, net_h, net_w):
        """
        Load the image of 'instance' and apply random scale, crop, colour
        distortion and flip.

        :return: the augmented RGB image and the objects with corrected coordinates
        :raises FileNotFoundError: if the image file cannot be read
        """
        image_name = instance['filename']
        image = cv2.imread(image_name)  # BGR image

        if image is None:
            # cv2.imread returns None instead of raising; fail here with the file
            # name rather than later inside cvtColor with an opaque OpenCV error.
            raise FileNotFoundError('Cannot find {}'.format(image_name))

        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # RGB image
        image_h, image_w, _ = image.shape

        # determine the amount of scaling and cropping
        dw = self.jitter * image_w
        dh = self.jitter * image_h

        new_ar = (image_w + np.random.uniform(-dw, dw)) / (image_h + np.random.uniform(-dh, dh))
        scale = np.random.uniform(0.25, 2)

        if new_ar < 1:
            new_h = int(scale * net_h)
            new_w = int(net_h * new_ar)
        else:
            new_w = int(scale * net_w)
            new_h = int(net_w / new_ar)

        dx = int(np.random.uniform(0, net_w - new_w))
        dy = int(np.random.uniform(0, net_h - new_h))

        # apply scaling and cropping
        im_sized = apply_random_scale_and_crop(image, new_w, new_h, net_w, net_h, dx, dy)

        # randomly distort hsv space
        im_sized = random_distort_image(im_sized)

        # randomly flip
        flip = np.random.randint(2)
        im_sized = random_flip(im_sized, flip)

        # correct the size and pos of bounding boxes
        all_objs = correct_bounding_boxes(instance['object'], new_w, new_h, net_w, net_h, dx, dy, flip,
                                          image_w, image_h)

        return im_sized, all_objs

    def on_epoch_end(self):
        """Reshuffle the instances when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.instances)

    def num_classes(self):
        """Return the number of labels."""
        return len(self.labels)

    def size(self):
        """Return the number of instances."""
        return len(self.instances)

    def get_anchors(self):
        """Return the anchors as a flat list (width, height, width, height, ...)."""
        anchors = []

        for anchor in self.anchors:
            anchors += [anchor.xmax, anchor.ymax]

        return anchors

    def load_annotation(self, i):
        """
        Return the objects of instance 'i' as an array of
        [xmin, ymin, xmax, ymax, label index] rows ([[]] when it has no object).
        """
        annots = []

        for obj in self.instances[i]['object']:
            annot = [obj['xmin'], obj['ymin'], obj['xmax'], obj['ymax'], self.labels.index(obj['name'])]
            annots += [annot]

        if len(annots) == 0:
            annots = [[]]

        return np.array(annots)

    def load_image(self, i):
        """Return the image of instance 'i' as read by OpenCV."""
        return cv2.imread(self.instances[i]['filename'])  # BGR image
def draw_boxes(image, boxes, labels, obj_thresh, quiet=True):
    """Draw every box that has at least one class above ``obj_thresh``.

    # Arguments
        image      : image array that is drawn on in place.
        boxes      : iterable of box objects exposing ``classes`` (per-class
                     scores) and ``xmin`` / ``ymin`` / ``xmax`` / ``ymax``.
        labels     : class names, indexed like ``box.classes``.
        obj_thresh : a class is reported when its score is strictly greater
                     than this value.
        quiet      : when False, print the caption built for each box.

    # Returns
        The same ``image`` object that was passed in.
    """
    for box in boxes:
        captions = []
        label = -1

        for i, name in enumerate(labels):
            class_score = box.classes[i]
            if class_score > obj_thresh:
                # Report each class with its own score; the previous code
                # printed the top-class score next to every class name.
                captions.append(name + ' ' + str(round(class_score * 100, 2)) + '%')
                label = i  # the last class above the threshold picks the colour

        label_str = ', '.join(captions)
        if not quiet:
            print(label_str)

        if label < 0:
            # nothing above the threshold: leave the image untouched
            continue

        color = get_color(label)
        text_size = cv2.getTextSize(label_str, cv2.FONT_HERSHEY_SIMPLEX, 1.1e-3 * image.shape[0], 5)
        width, height = text_size[0][0], text_size[0][1]
        # filled banner above the top-left corner that holds the caption
        region = np.array([[box.xmin-3,       box.ymin],
                           [box.xmin-3,       box.ymin-height-26],
                           [box.xmin+width+13, box.ymin-height-26],
                           [box.xmin+width+13, box.ymin]], dtype='int32')

        cv2.rectangle(img=image, pt1=(box.xmin, box.ymin), pt2=(box.xmax, box.ymax),
                      color=color, thickness=5)
        cv2.fillPoly(img=image, pts=[region], color=color)
        cv2.putText(img=image,
                    text=label_str,
                    org=(box.xmin+13, box.ymin - 13),
                    fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                    fontScale=1e-3 * image.shape[0],
                    color=(0, 0, 0),
                    thickness=2)

    return image
""" if label < len(colors): return colors[label] else: print('Label {} has no color, returning default.'.format(label)) return (0, 255, 0) colors = [ [31 , 0 , 255] , [0 , 159 , 255] , [255 , 95 , 0] , [255 , 19 , 0] , [255 , 0 , 0] , [255 , 38 , 0] , [0 , 255 , 25] , [255 , 0 , 133] , [255 , 172 , 0] , [108 , 0 , 255] , [0 , 82 , 255] , [0 , 255 , 6] , [255 , 0 , 152] , [223 , 0 , 255] , [12 , 0 , 255] , [0 , 255 , 178] , [108 , 255 , 0] , [184 , 0 , 255] , [255 , 0 , 76] , [146 , 255 , 0] , [51 , 0 , 255] , [0 , 197 , 255] , [255 , 248 , 0] , [255 , 0 , 19] , [255 , 0 , 38] , [89 , 255 , 0] , [127 , 255 , 0] , [255 , 153 , 0] , [0 , 255 , 255] , [0 , 255 , 216] , [0 , 255 , 121] , [255 , 0 , 248] , [70 , 0 , 255] , [0 , 255 , 159] , [0 , 216 , 255] , [0 , 6 , 255] , [0 , 63 , 255] , [31 , 255 , 0] , [255 , 57 , 0] , [255 , 0 , 210] , [0 , 255 , 102] , [242 , 255 , 0] , [255 , 191 , 0] , [0 , 255 , 63] , [255 , 0 , 95] , [146 , 0 , 255] , [184 , 255 , 0] , [255 , 114 , 0] , [0 , 255 , 235] , [255 , 229 , 0] , [0 , 178 , 255] , [255 , 0 , 114] , [255 , 0 , 57] , [0 , 140 , 255] , [0 , 121 , 255] , [12 , 255 , 0] , [255 , 210 , 0] , [0 , 255 , 44] , [165 , 255 , 0] , [0 , 25 , 255] , [0 , 255 , 140] , [0 , 101 , 255] , [0 , 255 , 82] , [223 , 255 , 0] , [242 , 0 , 255] , [89 , 0 , 255] , [165 , 0 , 255] , [70 , 255 , 0] , [255 , 0 , 172] , [255 , 76 , 0] , [203 , 255 , 0] , [204 , 0 , 255] , [255 , 0 , 229] , [255 , 133 , 0] , [127 , 0 , 255] , [0 , 235 , 255] , [0 , 255 , 197] , [255 , 0 , 191] , [0 , 44 , 255] , [50 , 255 , 0] ] ================================================ FILE: imageai_tf_deprecated/Detection/Custom/utils/image.py ================================================ import cv2 import numpy as np import copy def _rand_scale(scale): scale = np.random.uniform(1, scale) return scale if np.random.randint(2) == 0 else 1./scale def _constrain(min_v, max_v, value): if value < min_v: return min_v if value > max_v: return max_v return value def 
def random_flip(image, flip):
    """Return ``image`` mirrored horizontally when ``flip == 1``, else unchanged."""
    if flip == 1:
        return cv2.flip(image, 1)
    return image


def correct_bounding_boxes(boxes, new_w, new_h, net_w, net_h, dx, dy, flip, image_w, image_h):
    """Map bounding boxes from the source image onto the augmented image.

    # Arguments
        boxes            : list of dicts with 'xmin', 'ymin', 'xmax', 'ymax';
                           the input list is deep-copied, never modified.
        new_w, new_h     : size the source image was resized to.
        net_w, net_h     : size of the network input (clamping limits).
        dx, dy           : offset of the resized image inside the network input.
        flip             : 1 if the image was mirrored horizontally.
        image_w, image_h : size of the source image.

    # Returns
        A new list, in random order, of the boxes that still have a positive
        width and height after scaling, shifting and clamping.
    """
    boxes = copy.deepcopy(boxes)

    # randomize boxes' order
    np.random.shuffle(boxes)

    # correct sizes and positions
    sx, sy = float(new_w)/image_w, float(new_h)/image_h

    def clamp(value, upper):
        # keep a coordinate inside [0, upper]
        return min(max(value, 0), upper)

    kept = []
    for box in boxes:
        box['xmin'] = int(clamp(box['xmin']*sx + dx, net_w))
        box['xmax'] = int(clamp(box['xmax']*sx + dx, net_w))
        box['ymin'] = int(clamp(box['ymin']*sy + dy, net_h))
        box['ymax'] = int(clamp(box['ymax']*sy + dy, net_h))

        # drop boxes that collapsed to zero area (moved out of the crop)
        if box['xmax'] <= box['xmin'] or box['ymax'] <= box['ymin']:
            continue

        if flip == 1:
            box['xmin'], box['xmax'] = net_w - box['xmax'], net_w - box['xmin']

        kept.append(box)

    return kept


def random_distort_image(image, hue=18, saturation=1.5, exposure=1.5):
    """Randomly shift hue and scale saturation/exposure of an RGB image.

    # Arguments
        image      : RGB image accepted by ``cv2.cvtColor(..., COLOR_RGB2HSV)``.
        hue        : the hue shift is drawn uniformly from [-hue, hue].
        saturation : bound passed to _rand_scale for the saturation factor.
        exposure   : bound passed to _rand_scale for the value factor.

    # Returns
        The distorted image converted back to RGB (uint8).
    """
    # determine scale factors
    dhue = np.random.uniform(-hue, hue)
    dsat = _rand_scale(saturation)
    dexp = _rand_scale(exposure)

    # convert RGB space to HSV space
    image = cv2.cvtColor(image, cv2.COLOR_RGB2HSV).astype('float')

    # change saturation and exposure
    image[:, :, 1] *= dsat
    image[:, :, 2] *= dexp

    # change hue, wrapping around the 0..180 hue circle used for 8-bit images
    image[:, :, 0] += dhue
    image[:, :, 0] = np.mod(image[:, :, 0], 180)

    # scaled saturation/value can exceed 255; clip before the uint8 cast so
    # bright pixels saturate instead of wrapping around to dark values
    image[:, :, 1:] = np.clip(image[:, :, 1:], 0, 255)

    # convert back to RGB from HSV
    return cv2.cvtColor(image.astype('uint8'), cv2.COLOR_HSV2RGB)


def apply_random_scale_and_crop(image, new_w, new_h, net_w, net_h, dx, dy):
    """Resize ``image`` to (new_w, new_h) and place it at offset (dx, dy).

    Positive offsets pad with the constant 127 on the left/top, non-positive
    offsets crop from the left/top. The right/bottom sides are padded with 127
    when the image does not reach the network size, and the result is cut to
    at most ``net_h`` rows and ``net_w`` columns.
    """
    im_sized = cv2.resize(image, (new_w, new_h))

    # horizontal placement
    if dx > 0:
        im_sized = np.pad(im_sized, ((0, 0), (dx, 0), (0, 0)), mode='constant', constant_values=127)
    else:
        im_sized = im_sized[:, -dx:, :]
    if (new_w + dx) < net_w:
        im_sized = np.pad(im_sized, ((0, 0), (0, net_w - (new_w+dx)), (0, 0)),
                          mode='constant', constant_values=127)

    # vertical placement
    if dy > 0:
        im_sized = np.pad(im_sized, ((dy, 0), (0, 0), (0, 0)), mode='constant', constant_values=127)
    else:
        im_sized = im_sized[-dy:, :, :]
    if (new_h + dy) < net_h:
        im_sized = np.pad(im_sized, ((0, net_h - (new_h+dy)), (0, 0), (0, 0)),
                          mode='constant', constant_values=127)

    return im_sized[:net_h, :net_w, :]
def _sigmoid(x):
    """Element-wise logistic sigmoid (thin wrapper around scipy's expit)."""
    return expit(x)


def makedirs(path):
    """Create ``path`` and missing parents; an existing directory is fine.

    An OSError is still raised when ``path`` exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)


def evaluate(model,
             generator,
             iou_threshold,
             obj_thresh,
             nms_thresh,
             net_h=416,
             net_w=416,
             save_path=None):
    """ Evaluate a given dataset using a given model.
    code originally from https://github.com/fizyr/keras-retinanet

    # Arguments
        model           : The model to evaluate.
        generator       : The generator that represents the dataset to evaluate.
        iou_threshold   : The threshold used to consider when a detection is positive or negative.
        obj_thresh      : The threshold used to distinguish between object and non-object
        nms_thresh      : The threshold used to determine whether two detections are duplicates
        net_h           : The height of the input image to the model, higher value results in better accuracy
        net_w           : The width of the input image to the model
        save_path       : Accepted for interface compatibility; not used by this function.
    # Returns
        A dict mapping each class index (0 .. num_classes-1) to its average precision.
    """
    # gather all detections and annotations
    all_detections = [[None for i in range(generator.num_classes())] for j in range(generator.size())]
    all_annotations = [[None for i in range(generator.num_classes())] for j in range(generator.size())]

    for i in range(generator.size()):
        raw_image = [generator.load_image(i)]

        # make the boxes and the labels
        pred_boxes = get_yolo_boxes(model, raw_image, net_h, net_w, generator.get_anchors(),
                                    obj_thresh, nms_thresh)[0]

        # get_score() also fills box.label, so it must run before labels are read
        score = np.array([box.get_score() for box in pred_boxes])
        pred_labels = np.array([box.label for box in pred_boxes])

        if len(pred_boxes) > 0:
            pred_boxes = np.array([[box.xmin, box.ymin, box.xmax, box.ymax, box.get_score()]
                                   for box in pred_boxes])
        else:
            pred_boxes = np.zeros((0, 5))

        # sort the boxes and the labels according to scores
        score_sort = np.argsort(-score)
        pred_labels = pred_labels[score_sort]
        pred_boxes = pred_boxes[score_sort]

        # copy detections to all_detections
        for label in range(generator.num_classes()):
            all_detections[i][label] = pred_boxes[pred_labels == label, :]

        # An image without objects may come back as an empty array of another
        # shape; normalise to (n, 5) so the label column can always be indexed.
        annotations = np.asarray(generator.load_annotation(i)).reshape(-1, 5)

        # copy annotations to all_annotations
        for label in range(generator.num_classes()):
            all_annotations[i][label] = annotations[annotations[:, 4] == label, :4].copy()

    # compute mAP by comparing all detections and all annotations
    average_precisions = {}

    for label in range(generator.num_classes()):
        false_positives = np.zeros((0,))
        true_positives = np.zeros((0,))
        scores = np.zeros((0,))
        num_annotations = 0.0

        for i in range(generator.size()):
            detections = all_detections[i][label]
            annotations = all_annotations[i][label]
            num_annotations += annotations.shape[0]
            detected_annotations = []

            for d in detections:
                scores = np.append(scores, d[4])

                if annotations.shape[0] == 0:
                    false_positives = np.append(false_positives, 1)
                    true_positives = np.append(true_positives, 0)
                    continue

                overlaps = compute_overlap(np.expand_dims(d, axis=0), annotations)
                assigned_annotation = np.argmax(overlaps, axis=1)
                max_overlap = overlaps[0, assigned_annotation]

                # each ground-truth box can be matched by one detection only
                if max_overlap >= iou_threshold and assigned_annotation not in detected_annotations:
                    false_positives = np.append(false_positives, 0)
                    true_positives = np.append(true_positives, 1)
                    detected_annotations.append(assigned_annotation)
                else:
                    false_positives = np.append(false_positives, 1)
                    true_positives = np.append(true_positives, 0)

        # no annotations -> AP for this class is 0 (is this correct?)
        if num_annotations == 0:
            average_precisions[label] = 0
            continue

        # sort by score
        indices = np.argsort(-scores)
        false_positives = false_positives[indices]
        true_positives = true_positives[indices]

        # compute false positives and true positives
        false_positives = np.cumsum(false_positives)
        true_positives = np.cumsum(true_positives)

        # compute recall and precision
        recall = true_positives / num_annotations
        precision = true_positives / np.maximum(true_positives + false_positives, np.finfo(np.float64).eps)

        # compute average precision
        average_precision = compute_ap(recall, precision)
        average_precisions[label] = average_precision

    return average_precisions


def correct_yolo_boxes(boxes, image_h, image_w, net_h, net_w):
    """Convert letterboxed, normalised box coordinates to image pixels.

    The boxes are expected in units of the network input (0..1 along each
    axis), where the image was resized with its aspect ratio preserved and
    centred. Each box's xmin/xmax/ymin/ymax is overwritten in place with
    integer pixel coordinates of the original ``image_w`` x ``image_h`` image.
    """
    # size of the resized image inside the network input
    if (float(net_w)/image_w) < (float(net_h)/image_h):
        new_w = net_w
        new_h = (image_h*net_w)/image_w
    else:
        # height is the limiting side: it fills net_h (was wrongly net_w,
        # which only gave correct results for square network inputs)
        new_h = net_h
        new_w = (image_w*net_h)/image_h

    # fraction of the network input taken by padding / by the image
    x_offset, x_scale = (net_w - new_w)/2./net_w, float(new_w)/net_w
    y_offset, y_scale = (net_h - new_h)/2./net_h, float(new_h)/net_h

    for box in boxes:
        box.xmin = int((box.xmin - x_offset) / x_scale * image_w)
        box.xmax = int((box.xmax - x_offset) / x_scale * image_w)
        box.ymin = int((box.ymin - y_offset) / y_scale * image_h)
        box.ymax = int((box.ymax - y_offset) / y_scale * image_h)


def do_nms(boxes, nms_thresh):
    """Per-class non-maximum suppression, applied in place.

    For each class, boxes are visited from highest to lowest class score and
    every lower-scored box whose IoU with the current box is at least
    ``nms_thresh`` gets its score for that class set to 0. Returns None.
    """
    if len(boxes) > 0:
        nb_class = len(boxes[0].classes)
    else:
        return

    for c in range(nb_class):
        sorted_indices = np.argsort([-box.classes[c] for box in boxes])

        for i in range(len(sorted_indices)):
            index_i = sorted_indices[i]

            # already suppressed (or never scored) for this class
            if boxes[index_i].classes[c] == 0:
                continue

            for j in range(i+1, len(sorted_indices)):
                index_j = sorted_indices[j]

                if bbox_iou(boxes[index_i], boxes[index_j]) >= nms_thresh:
                    boxes[index_j].classes[c] = 0


def decode_netout(netout, anchors, obj_thresh, net_h, net_w):
    """Turn one raw YOLO output grid into a list of BoundBox objects.

    # Arguments
        netout     : array whose first two axes are the grid rows and columns;
                     the rest is reshaped to 3 boxes per cell, each holding
                     x, y, w, h, objectness and the class scores.
        anchors    : flat list [w0, h0, w1, h1, w2, h2] for the 3 boxes.
        obj_thresh : boxes with objectness <= obj_thresh are skipped and class
                     scores not above it are zeroed.
        net_h      : network input height (normalises box heights).
        net_w      : network input width (normalises box widths).

    # Returns
        List of BoundBox with coordinates as fractions of the network input.
    """
    grid_h, grid_w = netout.shape[:2]
    nb_box = 3
    # Work on a copy: the activations below are applied in place and must not
    # alter the caller's array (decoding the same output twice would otherwise
    # apply the sigmoid twice).
    netout = netout.reshape((grid_h, grid_w, nb_box, -1)).copy()

    boxes = []

    netout[..., :2] = _sigmoid(netout[..., :2])
    netout[..., 4] = _sigmoid(netout[..., 4])
    netout[..., 5:] = netout[..., 4][..., np.newaxis] * _softmax(netout[..., 5:])
    netout[..., 5:] *= netout[..., 5:] > obj_thresh

    for row in range(grid_h):
        for col in range(grid_w):
            for b in range(nb_box):
                # 4th element is objectness score
                objectness = netout[row, col, b, 4]

                if objectness <= obj_thresh:
                    continue

                # first 4 elements are x, y, w, and h
                x, y, w, h = netout[row, col, b, :4]

                x = (col + x) / grid_w  # center position, unit: image width
                y = (row + y) / grid_h  # center position, unit: image height
                w = anchors[2 * b + 0] * np.exp(w) / net_w  # unit: image width
                h = anchors[2 * b + 1] * np.exp(h) / net_h  # unit: image height

                # last elements are class probabilities
                classes = netout[row, col, b, 5:]

                box = BoundBox(x-w/2, y-h/2, x+w/2, y+h/2, objectness, classes)

                boxes.append(box)

    return boxes


def preprocess_input(image, net_h, net_w):
    """Letterbox a BGR image into a (1, net_h, net_w, 3) RGB batch in [0, 1].

    The image is resized with its aspect ratio preserved and pasted centred
    onto a canvas filled with 0.5.
    """
    new_h, new_w, _ = image.shape

    # determine the new size of the image
    if (float(net_w)/new_w) < (float(net_h)/new_h):
        new_h = (new_h * net_w)//new_w
        new_w = net_w
    else:
        new_w = (new_w * net_h)//new_h
        new_h = net_h

    # resize the image to the new size
    resized = cv2.resize(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)/255., (new_w, new_h))

    # embed the image into the standard letter box
    new_image = np.ones((net_h, net_w, 3)) * 0.5
    new_image[(net_h-new_h)//2:(net_h+new_h)//2, (net_w-new_w)//2:(net_w+new_w)//2, :] = resized
    new_image = np.expand_dims(new_image, 0)

    return new_image


def normalize(image):
    """Scale pixel values by 1/255."""
    return image/255.
def get_yolo_boxes(model, images, net_h, net_w, anchors, obj_thresh, nms_thresh):
    """Run the model on a batch of images and return the decoded boxes.

    # Arguments
        model      : object exposing ``predict_on_batch``; its result is
                     indexed as three output scales, each indexed per image.
        images     : list of image arrays (passed to preprocess_input).
        net_h      : network input height.
        net_w      : network input width.
        anchors    : flat anchor list; 6 values are used per output scale,
                     taken from the end of the list for the first scale.
        obj_thresh : objectness threshold used while decoding.
        nms_thresh : IoU threshold for non-maximum suppression.

    # Returns
        A list with one list of boxes per input image, in pixel coordinates
        of that image.
    """
    nb_images = len(images)
    if nb_images == 0:
        return []

    batch_input = np.zeros((nb_images, net_h, net_w, 3))

    # preprocess the input
    for i in range(nb_images):
        batch_input[i] = preprocess_input(images[i], net_h, net_w)

    # run the prediction
    batch_output = model.predict_on_batch(batch_input)
    batch_boxes = [None]*nb_images

    for i in range(nb_images):
        yolos = [batch_output[0][i], batch_output[1][i], batch_output[2][i]]
        boxes = []

        # decode the output of the network
        for j in range(len(yolos)):
            yolo_anchors = anchors[(2-j)*6:(3-j)*6]  # config['model']['anchors']
            boxes += decode_netout(yolos[j], yolo_anchors, obj_thresh, net_h, net_w)

        # correct the sizes of the bounding boxes using this image's own
        # size (previously the first image's size was used for the whole batch)
        image_h, image_w = images[i].shape[:2]
        correct_yolo_boxes(boxes, image_h, image_w, net_h, net_w)

        # suppress non-maximal boxes
        do_nms(boxes, nms_thresh)

        batch_boxes[i] = boxes

    return batch_boxes


def compute_overlap(a, b):
    """
    Code originally from https://github.com/rbgirshick/py-faster-rcnn.
    Parameters
    ----------
    a: (N, 4) ndarray of float, rows are [x1, y1, x2, y2]
    b: (K, 4) ndarray of float, rows are [x1, y1, x2, y2]
    Returns
    -------
    overlaps: (N, K) ndarray with the intersection-over-union of every pair
    """
    area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])

    iw = np.minimum(np.expand_dims(a[:, 2], axis=1), b[:, 2]) - np.maximum(np.expand_dims(a[:, 0], 1), b[:, 0])
    ih = np.minimum(np.expand_dims(a[:, 3], axis=1), b[:, 3]) - np.maximum(np.expand_dims(a[:, 1], 1), b[:, 1])

    # disjoint boxes give negative extents; treat them as no overlap
    iw = np.maximum(iw, 0)
    ih = np.maximum(ih, 0)

    intersection = iw * ih
    ua = np.expand_dims((a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1]), axis=1) + area - intersection

    # guard against division by zero for degenerate boxes
    ua = np.maximum(ua, np.finfo(float).eps)

    return intersection / ua


def compute_ap(recall, precision):
    """ Compute the average precision, given the recall and precision curves.
    Code originally from https://github.com/rbgirshick/py-faster-rcnn.

    # Arguments
        recall:    The recall curve (list).
        precision: The precision curve (list).
    # Returns
        The average precision as computed in py-faster-rcnn.
    """
    # correct AP calculation
    # first append sentinel values at the end
    mrec = np.concatenate(([0.], recall, [1.]))
    mpre = np.concatenate(([0.], precision, [0.]))

    # compute the precision envelope (running maximum from the right)
    for i in range(mpre.size - 1, 0, -1):
        mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])

    # to calculate area under PR curve, look for points
    # where X axis (recall) changes value
    i = np.where(mrec[1:] != mrec[:-1])[0]

    # and sum (\Delta recall) * prec
    ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
    return ap


def _softmax(x, axis=-1):
    """Softmax along ``axis``; the maximum is subtracted first for stability."""
    x = x - np.amax(x, axis, keepdims=True)
    e_x = np.exp(x)

    return e_x / e_x.sum(axis, keepdims=True)
obj['ymin'] = int(round(float(dim.text))) if 'xmax' in dim.tag: obj['xmax'] = int(round(float(dim.text))) if 'ymax' in dim.tag: obj['ymax'] = int(round(float(dim.text))) if len(img['object']) > 0: all_insts += [img] cache = {'all_insts': all_insts, 'seen_labels': seen_labels} with open(cache_name, 'wb') as handle: pickle.dump(cache, handle, protocol=pickle.HIGHEST_PROTOCOL) return all_insts, seen_labels ================================================ FILE: imageai_tf_deprecated/Detection/README.md ================================================ # ImageAI : Object Detection A **DeepQuest AI** project [https://deepquestai.com](https://deepquestai.com) ### TABLE OF CONTENTS - :white_square_button: First Object Detection - :white_square_button: Object Detection, Extraction and Fine-tune - :white_square_button: Custom Object Detection - :white_square_button: Detection Speed - :white_square_button: Hiding/Showing Object Name and Probability - :white_square_button: Image Input & Output Types - :white_square_button: Documentation ImageAI provides very convenient and powerful methods to perform object detection on images and extract each object from the image. The object detection class supports RetinaNet, YOLOv3 and TinyYOLOv3. 
To start performing object detection, you must download the RetinaNet, YOLOv3 or TinyYOLOv3 object detection model via the links below: * **[RetinaNet](https://github.com/OlafenwaMoses/ImageAI/releases/download/essentials-v5/resnet50_coco_best_v2.1.0.h5)** _(Size = 145 mb, high performance and accuracy, with longer detection time)_ * **[YOLOv3](https://github.com/OlafenwaMoses/ImageAI/releases/download/1.0/yolo.h5)** _(Size = 237 mb, moderate performance and accuracy, with a moderate detection time)_ * **[TinyYOLOv3](https://github.com/OlafenwaMoses/ImageAI/releases/download/1.0/yolo-tiny.h5)** _(Size = 34 mb, optimized for speed and moderate performance, with fast detection time)_ Once you download the object detection model file, you should copy the model file to your project folder where your .py files will be. Then create a python file and give it a name; an example is FirstObjectDetection.py. Then write the code below into the python file: ### FirstObjectDetection.py
```python from imageai.Detection import ObjectDetection import os execution_path = os.getcwd() detector = ObjectDetection() detector.setModelTypeAsYOLOv3() detector.setModelPath( os.path.join(execution_path , "yolo.h5")) detector.loadModel() detections = detector.detectObjectsFromImage(input_image=os.path.join(execution_path , "image2.jpg"), output_image_path=os.path.join(execution_path , "image2new.jpg"), minimum_percentage_probability=30) for eachObject in detections: print(eachObject["name"] , " : ", eachObject["percentage_probability"], " : ", eachObject["box_points"] ) print("--------------------------------") ``` Sample Result: Input Image ![Input Image](../../data-images/image2.jpg) Output Image ![Output Image](../../data-images/yolo.jpg) ``` laptop : 87.32235431671143 : (306, 238, 390, 284) -------------------------------- laptop : 96.86298966407776 : (121, 209, 258, 293) -------------------------------- laptop : 98.6301600933075 : (279, 321, 401, 425) -------------------------------- laptop : 99.78572130203247 : (451, 204, 579, 285) -------------------------------- bed : 94.02391314506531 : (23, 205, 708, 553) -------------------------------- apple : 48.03136885166168 : (527, 343, 557, 364) -------------------------------- cup : 34.09906327724457 : (462, 347, 496, 379) -------------------------------- cup : 44.65090036392212 : (582, 342, 618, 386) -------------------------------- person : 57.70219564437866 : (27, 311, 341, 437) -------------------------------- person : 85.26121377944946 : (304, 173, 387, 253) -------------------------------- person : 96.33603692054749 : (415, 130, 538, 266) -------------------------------- person : 96.95255160331726 : (174, 108, 278, 269) -------------------------------- ``` Let us make a breakdown of the object detection code that we used above. 
```python from imageai.Detection import ObjectDetection import os execution_path = os.getcwd() ``` In the 3 lines above, we import the **ImageAI object detection** class in the first line, import the `os` module in the second line and obtain the path to the folder where our python file runs. ```python detector = ObjectDetection() detector.setModelTypeAsYOLOv3() detector.setModelPath( os.path.join(execution_path , "yolo.h5")) detector.loadModel() ``` In the 4 lines above, we created a new instance of the `ObjectDetection` class in the first line, set the model type to YOLOv3 in the second line, set the model path to the YOLOv3 model file we downloaded and copied to the python file folder in the third line and loaded the model in the fourth line. ```python detections = detector.detectObjectsFromImage(input_image=os.path.join(execution_path , "image2.jpg"), output_image_path=os.path.join(execution_path , "image2new.jpg")) for eachObject in detections: print(eachObject["name"] , " : ", eachObject["percentage_probability"], " : ", eachObject["box_points"] ) print("--------------------------------") ``` In the lines above, we ran the `detectObjectsFromImage()` function and passed in the path to our image and the path to the new image which the function will save. The function then returns an array of dictionaries, with each dictionary corresponding to one object detected in the image. Each dictionary has the properties `name` (name of the object), `percentage_probability` (percentage probability of the detection) and `box_points` (the x1, y1, x2 and y2 coordinates of the bounding box of the object).
Should you want to use RetinaNet, which is appropriate for detection tasks that demand high performance and high accuracy, you will download the RetinaNet model file from the links above, copy it to your python file's folder, and set the model type and model path in your python code as seen below: ```python detector = ObjectDetection() detector.setModelTypeAsRetinaNet() detector.setModelPath( os.path.join(execution_path , "resnet50_coco_best_v2.1.0.h5")) detector.loadModel() ``` However, if you desire TinyYOLOv3, which is optimized for speed and embedded devices, you will download the TinyYOLOv3 model file from the links above, copy it to your python file's folder, and set the model type and model path in your python code as seen below: ```python detector = ObjectDetection() detector.setModelTypeAsTinyYOLOv3() detector.setModelPath( os.path.join(execution_path , "yolo-tiny.h5")) detector.loadModel() ``` ## Object Detection, Extraction and Fine-tune
In the examples we used above, we ran the object detection on an image and it returned the detected objects in an array as well as saved a new image with rectangular markers drawn on each object. In our next examples, we will be able to extract each object from the input image and save it independently. In the example code below, which is nearly identical to the previous object detection code, we will save each object detected as a separate image. ```python from imageai.Detection import ObjectDetection import os execution_path = os.getcwd() detector = ObjectDetection() detector.setModelTypeAsYOLOv3() detector.setModelPath( os.path.join(execution_path , "yolo.h5")) detector.loadModel() detections, objects_path = detector.detectObjectsFromImage(input_image=os.path.join(execution_path , "image3.jpg"), output_image_path=os.path.join(execution_path , "image3new.jpg"), minimum_percentage_probability=30, extract_detected_objects=True) for eachObject, eachObjectPath in zip(detections, objects_path): print(eachObject["name"] , " : " , eachObject["percentage_probability"], " : ", eachObject["box_points"] ) print("Object's image saved in " + eachObjectPath) print("--------------------------------") ``` ![Input Image](../../data-images/image3.jpg) ![Output Images](../../data-images/image3new.jpg) ![dog](../../data-images/image3new-objects/dog-1.jpg) ![motorcycle](../../data-images/image3new-objects/motorcycle-3.jpg) ![car](../../data-images/image3new-objects/car-4.jpg) ![bicycle](../../data-images/image3new-objects/bicycle-5.jpg) ![person](../../data-images/image3new-objects/person-6.jpg) ![person](../../data-images/image3new-objects/person-7.jpg) ![person](../../data-images/image3new-objects/person-8.jpg) ![person](../../data-images/image3new-objects/person-9.jpg) ![person](../../data-images/image3new-objects/person-10.jpg) Let us review the part of the code that performs the object detection and extracts the images: ```python detections, objects_path = 
detector.detectObjectsFromImage(input_image=os.path.join(execution_path , "image3.jpg"), output_image_path=os.path.join(execution_path , "image3new.jpg"), minimum_percentage_probability=30, extract_detected_objects=True) for eachObject, eachObjectPath in zip(detections, objects_path): print(eachObject["name"] , " : " , eachObject["percentage_probability"], " : ", eachObject["box_points"] ) print("Object's image saved in " + eachObjectPath) print("--------------------------------") ``` In the lines above, we called the `detectObjectsFromImage()` function and passed in the input image path, the output image path, and an extra parameter `extract_detected_objects=True`. This parameter states that the function should extract each object detected from the image and save it as a separate image. The parameter is `False` by default. Once set to `True`, the function will create a directory which is the **output image path + "-objects"**. Then it saves all the extracted images into this new directory with each image's name being the **detected object name + "-" + a number** which corresponds to the order in which the objects were detected. This new parameter we set to extract and save detected objects as images makes the function return 2 values. The first is the array of dictionaries with each dictionary corresponding to a detected object. The second is an array of the paths to the saved images of each object detected and extracted, arranged in the same order as the objects in the first array. **And one important feature you need to know!** You will recall that the percentage probability for each detected object is sent back by the `detectObjectsFromImage()` function. The function has a parameter `minimum_percentage_probability`, whose default value is `50` (value ranges between 0 - 100), but it is set to 30 in this example. That means the function will only return a detected object if its percentage probability is **30 or above**.
The value was kept at this number to ensure the integrity of the detection results. You can fine-tune the object detection by setting **minimum_percentage_probability** to a smaller value to detect more objects or to a higher value to detect fewer objects. ## Custom Object Detection
The object detection model (**RetinaNet**) supported by **ImageAI** can detect 80 different types of objects. They include: ``` person, bicycle, car, motorcycle, airplane, bus, train, truck, boat, traffic light, fire hydrant, stop_sign, parking meter, bench, bird, cat, dog, horse, sheep, cow, elephant, bear, zebra, giraffe, backpack, umbrella, handbag, tie, suitcase, frisbee, skis, snowboard, sports ball, kite, baseball bat, baseball glove, skateboard, surfboard, tennis racket, bottle, wine glass, cup, fork, knife, spoon, bowl, banana, apple, sandwich, orange, broccoli, carrot, hot dog, pizza, donot, cake, chair, couch, potted plant, bed, dining table, toilet, tv, laptop, mouse, remote, keyboard, cell phone, microwave, oven, toaster, sink, refrigerator, book, clock, vase, scissors, teddy bear, hair dryer, toothbrush. ``` Interestingly, **ImageAI** allow you to perform detection for one or more of the items above. That means you can customize the type of object(s) you want to be detected in the image. Let's take a look at the code below: ```python from imageai.Detection import ObjectDetection import os execution_path = os.getcwd() detector = ObjectDetection() detector.setModelTypeAsYOLOv3() detector.setModelPath( os.path.join(execution_path , "yolo.h5")) detector.loadModel() custom_objects = detector.CustomObjects(car=True, motorcycle=True) detections = detector.detectCustomObjectsFromImage(custom_objects=custom_objects, input_image=os.path.join(execution_path , "image3.jpg"), output_image_path=os.path.join(execution_path , "image3custom.jpg"), minimum_percentage_probability=30) for eachObject in detections: print(eachObject["name"] , " : ", eachObject["percentage_probability"], " : ", eachObject["box_points"] ) print("--------------------------------") ``` ![Result](../../data-images/image3custom.jpg) Let us take a look at the part of the code that made this possible. 
```python custom_objects = detector.CustomObjects(car=True, motorcycle=True) detections = detector.detectCustomObjectsFromImage(custom_objects=custom_objects, input_image=os.path.join(execution_path , "image3.jpg"), output_image_path=os.path.join(execution_path , "image3custom.jpg"), minimum_percentage_probability=30) ``` In the above code, after loading the model (can be done before loading the model as well), we defined a new variable `custom_objects = detector.CustomObjects()`, in which we set its car and motorcycle properties equal to **True**. This is to tell the model to detect only the object we set to True. Then we call the `detector.detectCustomObjectsFromImage()` which is the function that allows us to perform detection of custom objects. Then we will set the `custom_objects` value to the custom objects variable we defined. ## Detection Speed
**ImageAI** now provides detection speeds for all object detection tasks. The detection speeds allow you to reduce the time of detection at a rate between 20% - 80%, and yet having just slight changes but accurate detection results. Coupled with lowering the `minimum_percentage_probability` parameter, detections can match the normal speed and yet reduce detection time drastically. The available detection speeds are **"normal"**(default), **"fast"**, **"faster"** , **"fastest"** and **"flash"**. All you need to do is to state the speed mode you desire when loading the model as seen below. ```python detector.loadModel(detection_speed="fast") ``` ## Hiding/Showing Object Name and Probability
**ImageAI** provides options to hide the name of objects detected and/or the percentage probability from being shown on the saved/returned detected image. Using the `detectObjectsFromImage()` and `detectCustomObjectsFromImage()` functions, the parameters `display_object_name` and `display_percentage_probability` can be set to True or False individually. Take a look at the code below: ```python detections = detector.detectObjectsFromImage(input_image=os.path.join(execution_path , "image3.jpg"), output_image_path=os.path.join(execution_path , "image3new_nodetails.jpg"), minimum_percentage_probability=30, display_percentage_probability=False, display_object_name=False) ``` In the above code, we specified that both the object name and percentage probability should not be shown. As you can see in the result below, neither the names of the objects nor their individual percentage probabilities are shown in the detected image. ![Result](../../data-images/nodetails.jpg) ## Image Input & Output Types
**ImageAI** supports 3 types of inputs which are **file path to image file**(default), **numpy array of image** and **image file stream** as well as 2 types of output which are image **file**(default) and numpy **array**. This means you can now perform object detection in production applications such as on a web server or any system that returns files in any of the above-stated formats. To perform object detection with numpy array or file stream input, you just need to state the input type in the `.detectObjectsFromImage()` function or the `.detectCustomObjectsFromImage()` function. See example below. ```python detections = detector.detectObjectsFromImage(input_type="array", input_image=image_array , output_image_path=os.path.join(execution_path , "image.jpg")) # For numpy array input type detections = detector.detectObjectsFromImage(input_type="stream", input_image=image_stream , output_image_path=os.path.join(execution_path , "test2new.jpg")) # For file stream input type ``` To perform object detection with numpy array output you just need to state the output type in the `.detectObjectsFromImage()` function or the `.detectCustomObjectsFromImage()` function. See example below. ```python detected_image_array, detections = detector.detectObjectsFromImage(output_type="array", input_image="image.jpg" ) # For numpy array output type ``` ## Documentation
We have provided full documentation for all **ImageAI** classes and functions in 3 major languages. Find links below: * Documentation - **English Version [https://imageai.readthedocs.io](https://imageai.readthedocs.io)** * Documentation - **Chinese Version [https://imageai-cn.readthedocs.io](https://imageai-cn.readthedocs.io)** * Documentation - **French Version [https://imageai-fr.readthedocs.io](https://imageai-fr.readthedocs.io)** ================================================ FILE: imageai_tf_deprecated/Detection/VIDEO.md ================================================ # ImageAI : Video Object Detection, Tracking and Analysis A **DeepQuest AI** project [https://deepquestai.com](https://deepquestai.com) --- ## TABLE OF CONTENTS - :white_square_button: First Video Object Detection - :white_square_button: Custom Video Object Detection (Object Tracking) - :white_square_button: Camera / Live Stream Video Detection - :white_square_button: Video Analysis - :white_square_button: Detection Speed - :white_square_button: Hiding/Showing Object Name and Probability - :white_square_button: Frame Detection Intervals - :white_square_button: Video Detection Timeout (NEW) - :white_square_button: Documentation ImageAI provides convenient, flexible and powerful methods to perform object detection on videos. The video object detection class provided only supports RetinaNet, YOLOv3 and TinyYOLOv3. This version of **ImageAI** provides commercial grade video objects detection features, which include, but are not limited to, device/IP camera inputs, per frame, per second, per minute and entire video analysis for storing in databases and/or real-time visualizations and for future insights. 
To start performing video object detection, you must download the RetinaNet, YOLOv3 or TinyYOLOv3 object detection model via the links below: - **[RetinaNet](https://github.com/OlafenwaMoses/ImageAI/releases/download/1.0/resnet50_coco_best_v2.0.1.h5)** _(Size = 145 mb, high performance and accuracy, with longer detection time)_ - **[YOLOv3](https://github.com/OlafenwaMoses/ImageAI/releases/download/1.0/yolo.h5)** _(Size = 237 mb, moderate performance and accuracy, with a moderate detection time)_ - **[TinyYOLOv3](https://github.com/OlafenwaMoses/ImageAI/releases/download/1.0/yolo-tiny.h5)** _(Size = 34 mb, optimized for speed and moderate performance, with fast detection time)_ Because video object detection is a compute-intensive task, we advise you to perform this experiment using a computer with an NVIDIA GPU and the GPU version of Tensorflow installed. Performing video object detection on a CPU will be slower than using an NVIDIA GPU powered computer. You can use Google Colab for this experiment as it has an NVIDIA K80 GPU available for free. Once you download the object detection model file, you should copy the model file to your project folder where your .py files will be. Then create a python file and give it a name; an example is `FirstVideoObjectDetection.py`. Then write the code below into the python file: ### FirstVideoObjectDetection.py
```python from imageai.Detection import VideoObjectDetection import os execution_path = os.getcwd() detector = VideoObjectDetection() detector.setModelTypeAsRetinaNet() detector.setModelPath( os.path.join(execution_path , "resnet50_coco_best_v2.0.1.h5")) detector.loadModel() video_path = detector.detectObjectsFromVideo(input_file_path=os.path.join(execution_path, "traffic.mp4"), output_file_path=os.path.join(execution_path, "traffic_detected") , frames_per_second=20, log_progress=True) print(video_path) ``` Input Video (a 1min 24seconds video) [![](../../data-images/video--1.jpg)](https://github.com/OlafenwaMoses/ImageAI/blob/master/data-videos/traffic.mp4) Output Video [![](../../data-images/video-2.jpg)](https://www.youtube.com/embed/qplVDqOmElI?rel=0) Let us make a breakdown of the object detection code that we used above. ```python from imageai.Detection import VideoObjectDetection import os execution_path = os.getcwd() ``` In the 3 lines above, we import the **ImageAI video object detection** class in the first line, import the **os** module in the second line and obtain the path to the folder where our python file runs in the third line. ```python detector = VideoObjectDetection() detector.setModelTypeAsRetinaNet() detector.setModelPath( os.path.join(execution_path , "resnet50_coco_best_v2.0.1.h5")) detector.loadModel() ``` In the 4 lines above, we created a new instance of the **VideoObjectDetection** class in the first line, set the model type to RetinaNet in the second line, set the model path to the RetinaNet model file we downloaded and copied to the python file folder in the third line and load the model in the fourth line. 
```python video_path = detector.detectObjectsFromVideo(input_file_path=os.path.join(execution_path, "traffic.mp4"), output_file_path=os.path.join(execution_path, "traffic_detected"), frames_per_second=20, log_progress=True) print(video_path) ``` In the 2 lines above, we ran the `detectObjectsFromVideo()` function and passed in the path to our video, the path to the new video (without the extension, it saves a .avi video by default) which the function will save, the number of frames per second (fps) that we desire the output video to have and the option to log the progress of the detection in the console. Then the function returns the path to the saved video which contains boxes and percentage probabilities rendered on objects detected in the video. ### Custom Video Object Detection
The video object detection model (**RetinaNet**) supported by **ImageAI** can detect 80 different types of objects. They include: ``` person, bicycle, car, motorcycle, airplane, bus, train, truck, boat, traffic light, fire hydrant, stop_sign, parking meter, bench, bird, cat, dog, horse, sheep, cow, elephant, bear, zebra, giraffe, backpack, umbrella, handbag, tie, suitcase, frisbee, skis, snowboard, sports ball, kite, baseball bat, baseball glove, skateboard, surfboard, tennis racket, bottle, wine glass, cup, fork, knife, spoon, bowl, banana, apple, sandwich, orange, broccoli, carrot, hot dog, pizza, donot, cake, chair, couch, potted plant, bed, dining table, toilet, tv, laptop, mouse, remote, keyboard, cell phone, microwave, oven, toaster, sink, refrigerator, book, clock, vase, scissors, teddy bear, hair dryer, toothbrush. ``` Interestingly, **ImageAI** allows you to perform detection for one or more of the items above. That means you can customize the type of object(s) you want to be detected in the video. Let's take a look at the code below: ```python from imageai.Detection import VideoObjectDetection import os execution_path = os.getcwd() detector = VideoObjectDetection() detector.setModelTypeAsRetinaNet() detector.setModelPath( os.path.join(execution_path , "resnet50_coco_best_v2.0.1.h5")) detector.loadModel() custom_objects = detector.CustomObjects(person=True, bicycle=True, motorcycle=True) video_path = detector.detectCustomObjectsFromVideo( custom_objects=custom_objects, input_file_path=os.path.join(execution_path, "traffic.mp4"), output_file_path=os.path.join(execution_path, "traffic_custom_detected"), frames_per_second=20, log_progress=True) print(video_path) ``` Let us take a look at the part of the code that made this possible. 
```python custom_objects = detector.CustomObjects(person=True, bicycle=True, motorcycle=True) video_path = detector.detectCustomObjectsFromVideo( custom_objects=custom_objects, input_file_path=os.path.join(execution_path, "traffic.mp4"), output_file_path=os.path.join(execution_path, "traffic_custom_detected"), frames_per_second=20, log_progress=True) ``` In the above code, after loading the model (can be done before loading the model as well), we defined a new variable `custom_objects = detector.CustomObjects()`, in which we set its person, bicycle and motorcycle properties equal to **True**. This is to tell the model to detect only the objects we set to True. Then we call the `detector.detectCustomObjectsFromVideo()` which is the function that allows us to perform detection of custom objects. Then we will set the `custom_objects` value to the custom objects variable we defined. Output Video [![Output Video](../../data-images/video-3.jpg)](https://www.youtube.com/embed/YfAycAzkwPM?rel=0) C:\Users\User\PycharmProjects\ImageAITest\traffic_custom_detected.avi ### Camera / Live Stream Video Detection
**ImageAI** now allows live-video detection with support for camera inputs. Using **OpenCV**'s `VideoCapture()` function, you can load live-video streams from a device camera, cameras connected by cable or IP cameras, and pass it into **ImageAI**'s `detectObjectsFromVideo()` and `detectCustomObjectsFromVideo()` functions. All features that are supported for detecting objects in a video file are also available for detecting objects in a camera's live-video feed. Find below an example of detecting live-video feed from the device camera. ```python from imageai.Detection import VideoObjectDetection import os import cv2 execution_path = os.getcwd() camera = cv2.VideoCapture(0) detector = VideoObjectDetection() detector.setModelTypeAsRetinaNet() detector.setModelPath(os.path.join(execution_path , "resnet50_coco_best_v2.0.1.h5")) detector.loadModel() video_path = detector.detectObjectsFromVideo( camera_input=camera, output_file_path=os.path.join(execution_path, "camera_detected_video"), frames_per_second=20, log_progress=True, minimum_percentage_probability=40) ``` The difference between the code above and the code for the detection of a video file is that we defined an **OpenCV VideoCapture** instance and loaded the default device camera into it. Then we passed the camera we defined into the parameter `camera_input` which replaces the `input_file_path` that is used for a video file. ### Video Analysis
**ImageAI** now provides commercial-grade video analysis in the Video Object Detection class, for both video file inputs and camera inputs. This feature allows developers to obtain deep insights into any video processed with **ImageAI**. These insights can be visualized in real-time or stored in a NoSQL database for future review or analysis. For video analysis, the `detectObjectsFromVideo()` and `detectCustomObjectsFromVideo()` functions now allow you to state your own defined functions which will be executed for every frame, second and/or minute of the video detected, as well as state a function that will be executed at the end of a video detection. Once these functions are stated, they will receive raw but comprehensive analytical data on the index of the frame/second/minute, objects detected (name, percentage_probability and box_points), number of instances of each unique object detected and average number of occurrence of each unique object detected over a second/minute and entire video. To obtain the video analysis, all you need to do is specify a function, state the corresponding parameters it will be receiving and pass the function name into the `per_frame_function`, `per_second_function`, `per_minute_function` and `video_complete_function` parameters in the detection function. Find below examples of video analysis functions. 
```python def forFrame(frame_number, output_array, output_count): print("FOR FRAME " , frame_number) print("Output for each object : ", output_array) print("Output count for unique objects : ", output_count) print("------------END OF A FRAME --------------") def forSeconds(second_number, output_arrays, count_arrays, average_output_count): print("SECOND : ", second_number) print("Array for the outputs of each frame ", output_arrays) print("Array for output count for unique objects in each frame : ", count_arrays) print("Output average count for unique objects in the last second: ", average_output_count) print("------------END OF A SECOND --------------") def forMinute(minute_number, output_arrays, count_arrays, average_output_count): print("MINUTE : ", minute_number) print("Array for the outputs of each frame ", output_arrays) print("Array for output count for unique objects in each frame : ", count_arrays) print("Output average count for unique objects in the last minute: ", average_output_count) print("------------END OF A MINUTE --------------") video_detector = VideoObjectDetection() video_detector.setModelTypeAsYOLOv3() video_detector.setModelPath(os.path.join(execution_path, "yolo.h5")) video_detector.loadModel() video_detector.detectObjectsFromVideo( input_file_path=os.path.join(execution_path, "traffic.mp4"), output_file_path=os.path.join(execution_path, "traffic_detected"), frames_per_second=10, per_second_function=forSeconds, per_frame_function=forFrame, per_minute_function=forMinute, minimum_percentage_probability=30 ) ``` When the detection starts on a video feed, be it from a video file or camera input, the result will have the format as below: **Results for the Frame function** ``` FOR FRAME : 1 Output for each object : [{'box_points': (362, 295, 443, 355), 'name': 'boat', 'percentage_probability': 26.666194200515747}, {'box_points': (319, 245, 386, 296), 'name': 'boat', 'percentage_probability': 30.052968859672546}, {'box_points': (219, 308, 341, 
358), 'name': 'boat', 'percentage_probability': 47.46982455253601}, {'box_points': (589, 198, 621, 241), 'name': 'bus', 'percentage_probability': 24.62330162525177}, {'box_points': (519, 181, 583, 263), 'name': 'bus', 'percentage_probability': 27.446213364601135}, {'box_points': (493, 197, 561, 272), 'name': 'bus', 'percentage_probability': 59.81815457344055}, {'box_points': (432, 187, 491, 240), 'name': 'bus', 'percentage_probability': 64.42965269088745}, {'box_points': (157, 225, 220, 255), 'name': 'car', 'percentage_probability': 21.150341629981995}, {'box_points': (324, 249, 377, 293), 'name': 'car', 'percentage_probability': 24.089913070201874}, {'box_points': (152, 275, 260, 327), 'name': 'car', 'percentage_probability': 30.341443419456482}, {'box_points': (433, 198, 485, 244), 'name': 'car', 'percentage_probability': 37.205660343170166}, {'box_points': (184, 226, 233, 260), 'name': 'car', 'percentage_probability': 38.52525353431702}, {'box_points': (3, 296, 134, 359), 'name': 'car', 'percentage_probability': 47.80363142490387}, {'box_points': (357, 302, 439, 359), 'name': 'car', 'percentage_probability': 47.94844686985016}, {'box_points': (481, 266, 546, 314), 'name': 'car', 'percentage_probability': 65.8585786819458}, {'box_points': (597, 269, 624, 318), 'name': 'person', 'percentage_probability': 27.125394344329834}] Output count for unique objects : {'bus': 4, 'boat': 3, 'person': 1, 'car': 8} ------------END OF A FRAME -------------- ``` For any function you parse into the **per_frame_function**, the function will be executed after every single video frame is processed and he following will be parsed into it: * **Frame Index:** This is the position number of the frame inside the video (e.g 1 for first frame and 20 for twentieth frame). * **Output Array:** This is an array of dictionaries. 
Each dictionary corresponds to each detected object in the image and it contains the "name", "percentage_probabaility" and "box_points"(x1,y1,x2,y2) values of the object. * **Output Count:** This is a dictionary that has the name of each unique object detected as its keys and the number of instances of the objects detected as the values. **Results for the Second function** ``` FOR SECOND : 1 Array for the outputs of each frame [[{'box_points': (362, 295, 443, 355), 'name': 'boat', 'percentage_probability': 26.666194200515747}, {'box_points': (319, 245, 386, 296), 'name': 'boat', 'percentage_probability': 30.052968859672546}, {'box_points': (219, 308, 341, 358), 'name': 'boat', 'percentage_probability': 47.46982455253601}, {'box_points': (589, 198, 621, 241), 'name': 'bus', 'percentage_probability': 24.62330162525177}, {'box_points': (519, 181, 583, 263), 'name': 'bus', 'percentage_probability': 27.446213364601135}, {'box_points': (493, 197, 561, 272), 'name': 'bus', 'percentage_probability': 59.81815457344055}, {'box_points': (432, 187, 491, 240), 'name': 'bus', 'percentage_probability': 64.42965269088745}, {'box_points': (157, 225, 220, 255), 'name': 'car', 'percentage_probability': 21.150341629981995}, {'box_points': (324, 249, 377, 293), 'name': 'car', 'percentage_probability': 24.089913070201874}, {'box_points': (152, 275, 260, 327), 'name': 'car', 'percentage_probability': 30.341443419456482}, {'box_points': (433, 198, 485, 244), 'name': 'car', 'percentage_probability': 37.205660343170166}, {'box_points': (184, 226, 233, 260), 'name': 'car', 'percentage_probability': 38.52525353431702}, {'box_points': (3, 296, 134, 359), 'name': 'car', 'percentage_probability': 47.80363142490387}, {'box_points': (357, 302, 439, 359), 'name': 'car', 'percentage_probability': 47.94844686985016}, {'box_points': (481, 266, 546, 314), 'name': 'car', 'percentage_probability': 65.8585786819458}, {'box_points': (597, 269, 624, 318), 'name': 'person', 'percentage_probability': 
27.125394344329834}], [{'box_points': (316, 240, 384, 302), 'name': 'boat', 'percentage_probability': 29.594269394874573}, {'box_points': (361, 295, 441, 354), 'name': 'boat', 'percentage_probability': 36.11513376235962}, {'box_points': (216, 305, 340, 357), 'name': 'boat', 'percentage_probability': 44.89373862743378}, {'box_points': (432, 198, 488, 244), 'name': 'truck', 'percentage_probability': 22.914741933345795}, {'box_points': (589, 199, 623, 240), 'name': 'bus', 'percentage_probability': 20.545457303524017}, {'box_points': (519, 182, 583, 263), 'name': 'bus', 'percentage_probability': 24.467085301876068}, {'box_points': (492, 197, 563, 271), 'name': 'bus', 'percentage_probability': 61.112016439437866}, {'box_points': (433, 188, 490, 241), 'name': 'bus', 'percentage_probability': 65.08989334106445}, {'box_points': (352, 303, 442, 357), 'name': 'car', 'percentage_probability': 20.025095343589783}, {'box_points': (136, 172, 188, 195), 'name': 'car', 'percentage_probability': 21.571354568004608}, {'box_points': (152, 276, 261, 326), 'name': 'car', 'percentage_probability': 33.07966589927673}, {'box_points': (181, 225, 230, 256), 'name': 'car', 'percentage_probability': 35.111838579177856}, {'box_points': (432, 198, 488, 244), 'name': 'car', 'percentage_probability': 36.25282347202301}, {'box_points': (3, 292, 130, 360), 'name': 'car', 'percentage_probability': 67.55480170249939}, {'box_points': (479, 265, 546, 314), 'name': 'car', 'percentage_probability': 71.47912979125977}, {'box_points': (597, 269, 625, 318), 'name': 'person', 'percentage_probability': 25.903674960136414}],................, [{'box_points': (133, 250, 187, 278), 'name': 'umbrella', 'percentage_probability': 21.518094837665558}, {'box_points': (154, 233, 218, 259), 'name': 'umbrella', 'percentage_probability': 23.687003552913666}, {'box_points': (348, 311, 425, 360), 'name': 'boat', 'percentage_probability': 21.015766263008118}, {'box_points': (11, 164, 137, 225), 'name': 'bus', 
'percentage_probability': 32.20453858375549}, {'box_points': (424, 187, 485, 243), 'name': 'bus', 'percentage_probability': 38.043853640556335}, {'box_points': (496, 186, 570, 264), 'name': 'bus', 'percentage_probability': 63.83994221687317}, {'box_points': (588, 197, 622, 240), 'name': 'car', 'percentage_probability': 23.51653128862381}, {'box_points': (58, 268, 111, 303), 'name': 'car', 'percentage_probability': 24.538707733154297}, {'box_points': (2, 246, 72, 301), 'name': 'car', 'percentage_probability': 28.433072566986084}, {'box_points': (472, 273, 539, 323), 'name': 'car', 'percentage_probability': 87.17672824859619}, {'box_points': (597, 270, 626, 317), 'name': 'person', 'percentage_probability': 27.459821105003357}] ] Array for output count for unique objects in each frame : [{'bus': 4, 'boat': 3, 'person': 1, 'car': 8}, {'truck': 1, 'bus': 4, 'boat': 3, 'person': 1, 'car': 7}, {'bus': 5, 'boat': 2, 'person': 1, 'car': 5}, {'bus': 5, 'boat': 1, 'person': 1, 'car': 9}, {'truck': 1, 'bus': 2, 'car': 6, 'person': 1}, {'truck': 2, 'bus': 4, 'boat': 2, 'person': 1, 'car': 7}, {'truck': 1, 'bus': 3, 'car': 7, 'person': 1, 'umbrella': 1}, {'bus': 4, 'car': 7, 'person': 1, 'umbrella': 2}, {'bus': 3, 'car': 6, 'boat': 1, 'person': 1, 'umbrella': 3}, {'bus': 3, 'car': 4, 'boat': 1, 'person': 1, 'umbrella': 2}] Output average count for unique objects in the last second: {'truck': 0.5, 'bus': 3.7, 'umbrella': 0.8, 'boat': 1.3, 'person': 1.0, 'car': 6.6} ------------END OF A SECOND -------------- ``` In the above result, the video was processed and saved in 10 frames per second (FPS). For any function you parse into the **per_second_function**, the function will be executed after every single second of the video that is processed and he following will be parsed into it: - **Second Index:** This is the position number of the second inside the video (e.g 1 for first second and 20 for twentieth second). 
- **Output Array:** This is an array of arrays, with each contained array and its position (array index + 1) corresponding to the equivalent frame in the last second of the video (In the above example, there are 10 arrays which correspond to the 10 frames contained in one second). Each contained array contains dictionaries. Each dictionary corresponds to each detected object in the image and it contains the "name", "percentage_probability" and "box_points"(x1,y1,x2,y2) values of the object. - **Count arrays:** This is an array of dictionaries. Each dictionary and its position (array index + 1) corresponds to the equivalent frame in the last second of the video. Each dictionary has the name of each unique object detected as its keys and the number of instances of the objects detected as the values. - **Average Output Count:** This is a dictionary that has the name of each unique object detected in the last second as its keys and the average number of instances of the objects detected across the number of frames as the values. **Results for the Minute function** The above set of **4 parameters** that are returned for every second of the video processed are the same parameters that will be returned for every minute of the video processed. The difference is that the index returned corresponds to the minute index, the **output_arrays** is an array that contains the number of FPS * 60 number of arrays (in the code example above, 10 frames per second(fps) * 60 seconds = 600 frames = 600 arrays), and the **count_arrays** is an array that contains the number of FPS * 60 number of dictionaries (in the code example above, 10 frames per second(fps) * 60 seconds = 600 frames = 600 dictionaries) and the **average_output_count** is a dictionary that covers all the objects detected in all the frames contained in the last minute. **Results for the Video Complete Function** **ImageAI** allows you to obtain complete analysis of the entire video processed. 
All you need is to define a function like the forSeconds or forMinute function and set the **video_complete_function** parameter in your `.detectObjectsFromVideo()` or `.detectCustomObjectsFromVideo()` function. The same values for the per_second_function and per_minute_function will be returned. The difference is that no index will be returned; only the other 3 values will be returned, and the 3 values will cover all frames in the video. Below is a sample function: ```python def forFull(output_arrays, count_arrays, average_output_count): #Perform action on the 3 parameters returned into the function video_detector.detectObjectsFromVideo( input_file_path=os.path.join(execution_path, "traffic.mp4"), output_file_path=os.path.join(execution_path, "traffic_detected"), frames_per_second=10, video_complete_function=forFull, minimum_percentage_probability=30 ) ``` **FINAL NOTE ON VIDEO ANALYSIS** : **ImageAI** allows you to obtain the detected video frame as a Numpy array at each frame, second and minute function. All you need to do is specify one more parameter in your function and set `return_detected_frame=True` in your `detectObjectsFromVideo()` or `detectCustomObjectsFromVideo()` function. Once this is set, the extra parameter you specified in your function will be the Numpy array of the detected frame. See a sample below: ```python def forFrame(frame_number, output_array, output_count, detected_frame): print("FOR FRAME " , frame_number) print("Output for each object : ", output_array) print("Output count for unique objects : ", output_count) print("Returned Objects is : ", type(detected_frame)) print("------------END OF A FRAME --------------") video_detector.detectObjectsFromVideo( input_file_path=os.path.join(execution_path, "traffic.mp4"), output_file_path=os.path.join(execution_path, "traffic_detected"), frames_per_second=10, per_frame_function=forFrame, minimum_percentage_probability=30, return_detected_frame=True ) ``` ### Video Detection Speed
**ImageAI** now provides detection speeds for all video object detection tasks. The detection speeds allow you to reduce the time of detection at a rate between 20% - 80%, and yet having just slight changes but accurate detection results. Coupled with lowering the **minimum_percentage_probability** parameter, detections can closely match the normal speed and yet reduce detection time drastically. The available detection speeds are **"normal"**(default), **"fast"**, **"faster"** , **"fastest"** and **"flash"**. All you need to do is to state the speed mode you desire when loading the model as seen below. ```python detector.loadModel(detection_speed="fast") ``` To observe the differences in the detection speeds, look below for each speed applied to object detection coupled with the adjustment of the minimum_percentage_probability , time taken to detect and detections given. The results below are obtained from detections performed on a NVIDIA K80 GPU. Links are provided below to download the videos for each detection speed applied. Video Length = 1min 24seconds, Detection Speed = "normal" , Minimum Percentage Probability = 50 (default), Detection Time = 29min 3seconds [![](../../data-images/video-4.jpg)](https://www.youtube.com/embed/qplVDqOmElI?rel=0) **Video Length = 1min 24seconds, Detection Speed = "fast" , Minimum Percentage Probability = 40, Detection Time = 11min 6seconds** **Video Length = 1min 24seconds, Detection Speed = "faster" , Minimum Percentage Probability = 30, Detection Time = 7min 47seconds** **Video Length = 1min 24seconds, Detection Speed = "fastest" , Minimum Percentage Probability = 20, Detection Time = 6min 20seconds** **Video Length = 1min 24seconds, Detection Speed = "flash" , Minimum Percentage Probability = 10, Detection Time = 3min 55seconds** If you use more powerful NVIDIA GPUs, you will definitely have faster detection time than stated above. ### Frame Detection Intervals
The above video object detection tasks are optimized for frame-real-time object detections that ensure that objects in every frame of the video are detected. **ImageAI** provides you the option to adjust the video frame detections which can speed up your video detection process. When calling the `.detectObjectsFromVideo()` or `.detectCustomObjectsFromVideo()`, you can specify at which frame interval detections should be made. By setting the **frame_detection_interval** parameter to be equal to 5 or 20, that means the object detections in the video will be updated after 5 frames or 20 frames. If your output video **frames_per_second** is set to 20, that means the object detections in the video will be updated once in every quarter of a second or every second. This is useful in scenarios where the available compute is less powerful and speeds of moving objects are low. This ensures you can have objects detected as second-real-time , half-a-second-real-time or whichever way suits your needs. We conducted video object detection on the same input video we have been using all this while by applying a **frame_detection_interval** value equal to 5. The results below are obtained from detections performed on a NVIDIA K80 GPU. 
See the results and link to download the videos below: **Video Length = 1min 24seconds, Detection Speed = "normal" , Minimum Percentage Probability = 50 (default), Frame Detection Interval = 5, Detection Time = 15min 49seconds** **Video Length = 1min 24seconds, Detection Speed = "fast" , Minimum Percentage Probability = 40, Frame Detection Interval = 5, Detection Time = 5min 6seconds** **Video Length = 1min 24seconds, Detection Speed = "faster" , Minimum Percentage Probability = 30, Frame Detection Interval = 5, Detection Time = 3min 18seconds** **Video Length = 1min 24seconds, Detection Speed = "fastest" , Minimum Percentage Probability = 20 , Frame Detection Interval = 5, Detection Time = 2min 18seconds** [![](../../data-images/video-3.jpg)](https://www.youtube.com/embed/S-jgBTQgbd4?rel=0) **Video Length = 1min 24seconds, Detection Speed = "flash" , Minimum Percentage Probability = 10, Frame Detection Interval = 5, Detection Time = 1min 27seconds** [Download detected video at speed "flash" and interval=5](https://drive.google.com/open?id=1aN2nnVoFjhUWpcz2Und3dsCT9OKrakM0) ### Video Detection Timeout
**ImageAI** now allows you to set a timeout, in seconds, for the detection of objects in videos or a live camera feed. To set a timeout for your video detection code, all you need to do is set the `detection_timeout` parameter of the `detectObjectsFromVideo()` function to the desired number of seconds. In the example code below, we set `detection_timeout` to 120 seconds (2 minutes). ```python from imageai.Detection import VideoObjectDetection import os import cv2 execution_path = os.getcwd() camera = cv2.VideoCapture(0) detector = VideoObjectDetection() detector.setModelTypeAsRetinaNet() detector.setModelPath(os.path.join(execution_path , "resnet50_coco_best_v2.0.1.h5")) detector.loadModel() video_path = detector.detectObjectsFromVideo(camera_input=camera, output_file_path=os.path.join(execution_path, "camera_detected_video"), frames_per_second=20, log_progress=True, minimum_percentage_probability=40, detection_timeout=120) ``` ### Documentation
We have provided full documentation for all **ImageAI** classes and functions in 3 major languages. Find links below: - Documentation - **English Version [https://imageai.readthedocs.io](https://imageai.readthedocs.io)** - Documentation - **Chinese Version [https://imageai-cn.readthedocs.io](https://imageai-cn.readthedocs.io)** - Documentation - **French Version [https://imageai-fr.readthedocs.io](https://imageai-fr.readthedocs.io)** ================================================ FILE: imageai_tf_deprecated/Detection/YOLO/__init__.py ================================================ ================================================ FILE: imageai_tf_deprecated/Detection/YOLO/utils.py ================================================ import tensorflow as tf from keras import backend as K import numpy as np from PIL import Image import cv2 def yolo_head(feats, anchors, num_classes, input_shape, calc_loss=False): num_anchors = len(anchors) anchors_tensor = K.reshape(K.constant(anchors), [1, 1, 1, num_anchors, 2]) grid_shape = K.shape(feats)[1:3] grid_y = K.tile(K.reshape(K.arange(0, stop=grid_shape[0]), [-1, 1, 1, 1]), [1, grid_shape[1], 1, 1]) grid_x = K.tile(K.reshape(K.arange(0, stop=grid_shape[1]), [1, -1, 1, 1]), [grid_shape[0], 1, 1, 1]) grid = K.concatenate([grid_x, grid_y]) grid = K.cast(grid, K.dtype(feats)) feats = K.reshape( feats, [-1, grid_shape[0], grid_shape[1], num_anchors, num_classes + 5]) box_xy = (K.sigmoid(feats[..., :2]) + grid) / K.cast(grid_shape[::-1], K.dtype(feats)) box_wh = K.exp(feats[..., 2:4]) * anchors_tensor / K.cast(input_shape[::-1], K.dtype(feats)) box_confidence = K.sigmoid(feats[..., 4:5]) box_class_probs = K.sigmoid(feats[..., 5:]) if calc_loss == True: return grid, feats, box_xy, box_wh return box_xy, box_wh, box_confidence, box_class_probs def yolo_correct_boxes(box_xy, box_wh, input_shape, image_shape): box_yx = box_xy[..., ::-1] box_hw = box_wh[..., ::-1] input_shape = K.cast(input_shape, K.dtype(box_yx)) image_shape = 
K.cast(image_shape, K.dtype(box_yx)) new_shape = K.round(image_shape * K.min(input_shape/image_shape)) offset = (input_shape-new_shape)/2./input_shape scale = input_shape/new_shape box_yx = (box_yx - offset) * scale box_hw *= scale box_mins = box_yx - (box_hw / 2.) box_maxes = box_yx + (box_hw / 2.) boxes = K.concatenate([ box_mins[..., 0:1], box_mins[..., 1:2], box_maxes[..., 0:1], box_maxes[..., 1:2] ]) boxes *= K.concatenate([image_shape, image_shape]) return boxes def yolo_boxes_and_scores(feats, anchors, num_classes, input_shape, image_shape): box_xy, box_wh, box_confidence, box_class_probs = yolo_head(feats, anchors, num_classes, input_shape) boxes = yolo_correct_boxes(box_xy, box_wh, input_shape, image_shape) boxes = K.reshape(boxes, [-1, 4]) box_scores = box_confidence * box_class_probs box_scores = K.reshape(box_scores, [-1, num_classes]) return boxes, box_scores def yolo_eval(yolo_outputs, anchors, num_classes, image_shape, max_boxes=20, score_threshold=.6, iou_threshold=.5): num_layers = len(yolo_outputs) anchor_mask = [[6,7,8], [3,4,5], [0,1,2]] if num_layers==3 else [[3,4,5], [1,2,3]] input_shape = K.shape(yolo_outputs[0])[1:3] * 32 boxes = [] box_scores = [] for l in range(num_layers): _boxes, _box_scores = yolo_boxes_and_scores(yolo_outputs[l], anchors[anchor_mask[l]], num_classes, input_shape, image_shape) boxes.append(_boxes) box_scores.append(_box_scores) boxes = K.concatenate(boxes, axis=0) box_scores = K.concatenate(box_scores, axis=0) mask = box_scores >= score_threshold max_boxes_tensor = K.constant(max_boxes, dtype='int32') boxes_ = [] scores_ = [] classes_ = [] for c in range(num_classes): class_boxes = tf.boolean_mask(boxes, mask[:, c]) class_box_scores = tf.boolean_mask(box_scores[:, c], mask[:, c]) nms_index = tf.image.non_max_suppression( class_boxes, class_box_scores, max_boxes_tensor, iou_threshold=iou_threshold) class_boxes = K.gather(class_boxes, nms_index) class_box_scores = K.gather(class_box_scores, nms_index) classes = 
K.ones_like(class_box_scores, 'int32') * c boxes_.append(class_boxes) scores_.append(class_box_scores) classes_.append(classes) boxes_ = K.concatenate(boxes_, axis=0) scores_ = K.concatenate(scores_, axis=0) classes_ = K.concatenate(classes_, axis=0) return boxes_, scores_, classes_ def letterbox_image(image, size): iw, ih = image.size w, h = size scale = min(w/iw, h/ih) nw = int(iw*scale) nh = int(ih*scale) image = image.resize((nw,nh), Image.BICUBIC) new_image = Image.new('RGB', size, (128,128,128)) new_image.paste(image, ((w-nw)//2, (h-nh)//2)) return new_image def correct_yolo_boxes(boxes, image_h, image_w, net_h, net_w): if (float(net_w)/image_w) < (float(net_h)/image_h): new_w = net_w new_h = (image_h*net_w)/image_w else: new_h = net_w new_w = (image_w*net_h)/image_h for i in range(len(boxes)): x_offset, x_scale = (net_w - new_w)/2./net_w, float(new_w)/net_w y_offset, y_scale = (net_h - new_h)/2./net_h, float(new_h)/net_h boxes[i].xmin = int((boxes[i].xmin - x_offset) / x_scale * image_w) boxes[i].xmax = int((boxes[i].xmax - x_offset) / x_scale * image_w) boxes[i].ymin = int((boxes[i].ymin - y_offset) / y_scale * image_h) boxes[i].ymax = int((boxes[i].ymax - y_offset) / y_scale * image_h) class BoundBox: def __init__(self, xmin, ymin, xmax, ymax, objness = None, classes = None): self.xmin = xmin self.ymin = ymin self.xmax = xmax self.ymax = ymax self.objness = objness self.classes = classes self.label = -1 self.score = -1 def get_label(self): if self.label == -1: self.label = np.argmax(self.classes) return self.label def get_score(self): if self.score == -1: self.score = self.classes[self.get_label()] return self.score def _interval_overlap(interval_a, interval_b): x1, x2 = interval_a x3, x4 = interval_b if x3 < x1: if x4 < x1: return 0 else: return min(x2,x4) - x1 else: if x2 < x3: return 0 else: return min(x2,x4) - x3 def _sigmoid(x): return 1. / (1. 
+ np.exp(-x)) def bbox_iou(box1, box2): intersect_w = _interval_overlap([box1.xmin, box1.xmax], [box2.xmin, box2.xmax]) intersect_h = _interval_overlap([box1.ymin, box1.ymax], [box2.ymin, box2.ymax]) intersect = intersect_w * intersect_h w1, h1 = box1.xmax-box1.xmin, box1.ymax-box1.ymin w2, h2 = box2.xmax-box2.xmin, box2.ymax-box2.ymin union = w1*h1 + w2*h2 - intersect return float(intersect) / union def do_nms(boxes, nms_thresh): if len(boxes) > 0: nb_class = len(boxes[0].classes) else: return for c in range(nb_class): sorted_indices = np.argsort([-box.classes[c] for box in boxes]) for i in range(len(sorted_indices)): index_i = sorted_indices[i] if boxes[index_i].classes[c] == 0: continue for j in range(i+1, len(sorted_indices)): index_j = sorted_indices[j] if bbox_iou(boxes[index_i], boxes[index_j]) >= nms_thresh: boxes[index_j].classes[c] = 0 def decode_netout(netout, anchors, obj_thresh, nms_thresh, net_h, net_w): grid_h, grid_w = netout.shape[:2] nb_box = 3 netout = netout.reshape((grid_h, grid_w, nb_box, -1)) nb_class = netout.shape[-1] - 5 boxes = [] netout[..., :2] = _sigmoid(netout[..., :2]) netout[..., 4:] = _sigmoid(netout[..., 4:]) netout[..., 5:] = netout[..., 4][..., np.newaxis] * netout[..., 5:] netout[..., 5:] *= netout[..., 5:] > obj_thresh for i in range(grid_h*grid_w): row = i / grid_w col = i % grid_w for b in range(nb_box): # 4th element is objectness score objectness = netout[int(row)][int(col)][b][4] #objectness = netout[..., :4] if(objectness.all() <= obj_thresh): continue # first 4 elements are x, y, w, and h x, y, w, h = netout[int(row)][int(col)][b][:4] x = (col + x) / grid_w # center position, unit: image width y = (row + y) / grid_h # center position, unit: image height w = anchors[2 * b + 0] * np.exp(w) / net_w # unit: image width h = anchors[2 * b + 1] * np.exp(h) / net_h # unit: image height # last elements are class probabilities classes = netout[int(row)][col][b][5:] box = BoundBox(x-w/2, y-h/2, x+w/2, y+h/2, objectness, classes) 
#box = BoundBox(x-w/2, y-h/2, x+w/2, y+h/2, None, classes) boxes.append(box) return boxes def preprocess_input(image, input_shape): net_h, net_w = input_shape new_h, new_w, _ = image.shape # determine the new size of the image if (float(net_w)/new_w) < (float(net_h)/new_h): new_h = int((new_h * net_w)/new_w) new_w = net_w else: new_w = int((new_w * net_h)/new_h) new_h = net_h # resize the image to the new size resized = cv2.resize(image[:,:,::-1]/255., (int(new_w), int(new_h))) # embed the image into the standard letter box new_image = np.ones((net_h, net_w, 3)) * 0.5 new_image[int((net_h-new_h)//2):int((net_h+new_h)//2), int((net_w-new_w)//2):int((net_w+new_w)//2), :] = resized new_image = np.expand_dims(new_image, 0) return new_image def retrieve_yolo_detections(yolo_result, anchors, min_probability, nms_thresh, image_input_size, image_size, labels_dict ): boxes = [] for i in range(len(yolo_result)): # decode the output of the network boxes += decode_netout(yolo_result[i][0], anchors[i], min_probability, nms_thresh, image_input_size[0], image_input_size[1]) # correct the sizes of the bounding boxes correct_yolo_boxes(boxes, image_size[1], image_size[0], image_input_size[0], image_input_size[1]) # suppress non-maximal boxes do_nms(boxes, nms_thresh) detections = list() for box in boxes: label = -1 for i in range(len(labels_dict.keys())): if box.classes[i] > min_probability: label = labels_dict[i] percentage_probability = box.classes[i] * 100 xmin = box.xmin ymin = box.ymin xmax = box.xmax ymax = box.ymax if xmin < 0: xmin = 0 if ymin < 0: ymin = 0 detection = dict() detection["name"] = label detection["percentage_probability"] = percentage_probability detection["box_points"] = [ xmin, ymin, xmax, ymax] detections.append(detection) return detections def draw_boxes(image, box_points, draw_box, label, percentage_probability, color): xmin, ymin, xmax, ymax = box_points if draw_box is True: cv2.rectangle(image, (xmin,ymin), (xmax,ymax), color, 2) if label is not None: 
if percentage_probability is None: label = "{}".format(label) else: label = "{} {:.2f}%".format(label, percentage_probability) elif percentage_probability is not None: label = "{:.2f}".format(percentage_probability) if label is not None or percentage_probability is not None: cv2.putText(image, label, (xmin, ymin - 13), cv2.FONT_HERSHEY_SIMPLEX, 1e-3 * image.shape[0], (255, 0, 0), 2) cv2.putText(image, label, (xmin, ymin - 13), cv2.FONT_HERSHEY_SIMPLEX, 1e-3 * image.shape[0], (255, 255, 255), 1) return image ================================================ FILE: imageai_tf_deprecated/Detection/YOLO/yolov3.py ================================================ from tensorflow.keras.layers import Conv2D, MaxPool2D, Add, ZeroPadding2D, UpSampling2D, Concatenate, LeakyReLU, Lambda from tensorflow.keras.layers import LeakyReLU from tensorflow.keras.layers import BatchNormalization from tensorflow.keras.regularizers import l2 from tensorflow.keras.models import Model from tensorflow.keras import Input from tensorflow.keras.layers import add, concatenate from tensorflow.keras.layers import Layer import tensorflow as tf class YoloLayer(Layer): def __init__(self, anchors, max_grid, batch_size, warmup_batches, ignore_thresh, grid_scale, obj_scale, noobj_scale, xywh_scale, class_scale, **kwargs): # make the model settings persistent self.ignore_thresh = ignore_thresh self.warmup_batches = warmup_batches self.anchors = tf.constant(anchors, dtype='float', shape=[1,1,1,3,2]) self.grid_scale = grid_scale self.obj_scale = obj_scale self.noobj_scale = noobj_scale self.xywh_scale = xywh_scale self.class_scale = class_scale # make a persistent mesh grid max_grid_h, max_grid_w = max_grid cell_x = tf.cast(tf.reshape(tf.tile(tf.range(max_grid_w), [max_grid_h]), (1, max_grid_h, max_grid_w, 1, 1)), dtype=tf.float32) cell_y = tf.transpose(cell_x, (0,2,1,3,4)) self.cell_grid = tf.tile(tf.concat([cell_x,cell_y],-1), [batch_size, 1, 1, 3, 1]) super(YoloLayer, self).__init__(**kwargs) def 
build(self, input_shape): super(YoloLayer, self).build(input_shape) # Be sure to call this somewhere! def call(self, x): input_image, y_pred, y_true, true_boxes = x # adjust the shape of the y_predict [batch, grid_h, grid_w, 3, 4+1+nb_class] y_pred = tf.reshape(y_pred, tf.concat([tf.shape(y_pred)[:3], tf.constant([3, -1])], axis=0)) # initialize the masks object_mask = tf.expand_dims(y_true[..., 4], 4) # the variable to keep track of number of batches processed batch_seen = tf.Variable(0.) # compute grid factor and net factor grid_h = tf.shape(y_true)[1] grid_w = tf.shape(y_true)[2] grid_factor = tf.reshape(tf.cast([grid_w, grid_h], tf.float32), [1,1,1,1,2]) net_h = tf.shape(input_image)[1] net_w = tf.shape(input_image)[2] net_factor = tf.reshape(tf.cast([net_w, net_h], tf.float32), [1,1,1,1,2]) """ Adjust prediction """ pred_box_xy = (self.cell_grid[:,:grid_h,:grid_w,:,:] + tf.sigmoid(y_pred[..., :2])) # sigma(t_xy) + c_xy pred_box_wh = y_pred[..., 2:4] # t_wh pred_box_conf = tf.expand_dims(tf.sigmoid(y_pred[..., 4]), 4) # adjust confidence pred_box_class = y_pred[..., 5:] # adjust class probabilities """ Adjust ground truth """ true_box_xy = y_true[..., 0:2] # (sigma(t_xy) + c_xy) true_box_wh = y_true[..., 2:4] # t_wh true_box_conf = tf.expand_dims(y_true[..., 4], 4) true_box_class = tf.argmax(y_true[..., 5:], -1) """ Compare each predicted box to all true boxes """ # initially, drag all objectness of all boxes to 0 conf_delta = pred_box_conf - 0 # then, ignore the boxes which have good overlap with some true box true_xy = true_boxes[..., 0:2] / grid_factor true_wh = true_boxes[..., 2:4] / net_factor true_wh_half = true_wh / 2. true_mins = true_xy - true_wh_half true_maxes = true_xy + true_wh_half pred_xy = tf.expand_dims(pred_box_xy / grid_factor, 4) pred_wh = tf.expand_dims(tf.exp(pred_box_wh) * self.anchors / net_factor, 4) pred_wh_half = pred_wh / 2. 
pred_mins = pred_xy - pred_wh_half pred_maxes = pred_xy + pred_wh_half intersect_mins = tf.maximum(pred_mins, true_mins) intersect_maxes = tf.minimum(pred_maxes, true_maxes) intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.) intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1] true_areas = true_wh[..., 0] * true_wh[..., 1] pred_areas = pred_wh[..., 0] * pred_wh[..., 1] union_areas = pred_areas + true_areas - intersect_areas iou_scores = tf.truediv(intersect_areas, union_areas) best_ious = tf.reduce_max(iou_scores, axis=4) conf_delta *= tf.expand_dims(tf.cast((best_ious < self.ignore_thresh), dtype=tf.float32), 4) """ Compute some online statistics """ true_xy = true_box_xy / grid_factor true_wh = tf.exp(true_box_wh) * self.anchors / net_factor true_wh_half = true_wh / 2. true_mins = true_xy - true_wh_half true_maxes = true_xy + true_wh_half pred_xy = pred_box_xy / grid_factor pred_wh = tf.exp(pred_box_wh) * self.anchors / net_factor pred_wh_half = pred_wh / 2. pred_mins = pred_xy - pred_wh_half pred_maxes = pred_xy + pred_wh_half intersect_mins = tf.maximum(pred_mins, true_mins) intersect_maxes = tf.minimum(pred_maxes, true_maxes) intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.) 
intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1] true_areas = true_wh[..., 0] * true_wh[..., 1] pred_areas = pred_wh[..., 0] * pred_wh[..., 1] union_areas = pred_areas + true_areas - intersect_areas iou_scores = tf.truediv(intersect_areas, union_areas) iou_scores = object_mask * tf.expand_dims(iou_scores, 4) count = tf.reduce_sum(object_mask) count_noobj = tf.reduce_sum(1 - object_mask) detect_mask = tf.cast((pred_box_conf*object_mask >= 0.5), dtype=tf.float32) class_mask = tf.expand_dims(tf.cast(tf.equal(tf.argmax(pred_box_class, -1), true_box_class), dtype=tf.float32), 4) recall50 = tf.reduce_sum(tf.cast((iou_scores >= 0.5), dtype=tf.float32) * detect_mask * class_mask) / (count + 1e-3) recall75 = tf.reduce_sum(tf.cast((iou_scores >= 0.75), dtype=tf.float32) * detect_mask * class_mask) / (count + 1e-3) avg_iou = tf.reduce_sum(iou_scores) / (count + 1e-3) avg_obj = tf.reduce_sum(pred_box_conf * object_mask) / (count + 1e-3) avg_noobj = tf.reduce_sum(pred_box_conf * (1-object_mask)) / (count_noobj + 1e-3) avg_cat = tf.reduce_sum(object_mask * class_mask) / (count + 1e-3) """ Warm-up training """ batch_seen = tf.compat.v1.assign_add(batch_seen, 1.) 
true_box_xy, true_box_wh, xywh_mask = tf.cond(tf.less(batch_seen, self.warmup_batches+1), lambda: [true_box_xy + (0.5 + self.cell_grid[:,:grid_h,:grid_w,:,:]) * (1-object_mask), true_box_wh + tf.zeros_like(true_box_wh) * (1-object_mask), tf.ones_like(object_mask)], lambda: [true_box_xy, true_box_wh, object_mask]) """ Compare each true box to all anchor boxes """ wh_scale = tf.exp(true_box_wh) * self.anchors / net_factor wh_scale = tf.expand_dims(2 - wh_scale[..., 0] * wh_scale[..., 1], axis=4) # the smaller the box, the bigger the scale xy_delta = xywh_mask * (pred_box_xy-true_box_xy) * wh_scale * self.xywh_scale wh_delta = xywh_mask * (pred_box_wh-true_box_wh) * wh_scale * self.xywh_scale conf_delta = object_mask * (pred_box_conf-true_box_conf) * self.obj_scale + (1-object_mask) * conf_delta * self.noobj_scale class_delta = object_mask * \ tf.expand_dims(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=true_box_class, logits=pred_box_class), 4) * \ self.class_scale loss_xy = tf.reduce_sum(tf.square(xy_delta), list(range(1,5))) loss_wh = tf.reduce_sum(tf.square(wh_delta), list(range(1,5))) loss_conf = tf.reduce_sum(tf.square(conf_delta), list(range(1,5))) loss_class = tf.reduce_sum(class_delta, list(range(1,5))) loss = loss_xy + loss_wh + loss_conf + loss_class return loss*self.grid_scale def compute_output_shape(self, input_shape): return [(None, 1)] def dummy_loss(y_true, y_pred): return tf.sqrt(tf.reduce_sum(y_pred)) def NetworkConv2D_BN_Leaky(input, channels, kernel_size, kernel_regularizer = l2(5e-4), strides=(1,1), padding="same", use_bias=False): network = Conv2D( filters=channels, kernel_size=kernel_size, strides=strides, padding=padding, kernel_regularizer=kernel_regularizer, use_bias=use_bias)(input) network = BatchNormalization()(network) network = LeakyReLU(alpha=0.1)(network) return network def residual_block(input, channels, num_blocks): network = ZeroPadding2D(((1,0), (1,0)))(input) network = 
NetworkConv2D_BN_Leaky(input=network,channels=channels, kernel_size=(3,3), strides=(2,2), padding="valid") for blocks in range(num_blocks): network_1 = NetworkConv2D_BN_Leaky(input=network, channels= channels // 2, kernel_size=(1,1)) network_1 = NetworkConv2D_BN_Leaky(input=network_1,channels= channels, kernel_size=(3,3)) network = Add()([network, network_1]) return network def darknet(input): network = NetworkConv2D_BN_Leaky(input=input, channels=32, kernel_size=(3,3)) network = residual_block(input=network, channels=64, num_blocks=1) network = residual_block(input=network, channels=128, num_blocks=2) network = residual_block(input=network, channels=256, num_blocks=8) network = residual_block(input=network, channels=512, num_blocks=8) network = residual_block(input=network, channels=1024, num_blocks=4) return network def last_layers(input, channels_in, channels_out, layer_name=""): network = NetworkConv2D_BN_Leaky( input=input, channels=channels_in, kernel_size=(1,1)) network = NetworkConv2D_BN_Leaky(input=network, channels= (channels_in * 2) , kernel_size=(3, 3)) network = NetworkConv2D_BN_Leaky(input=network, channels=channels_in, kernel_size=(1, 1)) network = NetworkConv2D_BN_Leaky(input=network, channels=(channels_in * 2), kernel_size=(3, 3)) network = NetworkConv2D_BN_Leaky(input=network, channels=channels_in, kernel_size=(1, 1)) network_1 = NetworkConv2D_BN_Leaky(input=network, channels=(channels_in * 2), kernel_size=(3, 3)) network_1 = Conv2D(filters=channels_out, kernel_size=(1,1), name=layer_name)(network_1) return network, network_1 def yolov3_base(input, num_anchors, num_classes): darknet_network = Model(input, darknet(input)) network, network_1 = last_layers(darknet_network.output, 512, num_anchors * (num_classes + 5), layer_name="last1") network = NetworkConv2D_BN_Leaky( input=network, channels=256, kernel_size=(1,1)) network = UpSampling2D(2)(network) network = Concatenate()([network, darknet_network.layers[152].output]) network, network_2 = 
last_layers(network, 256, num_anchors * (num_classes + 5), layer_name="last2") network = NetworkConv2D_BN_Leaky(input=network, channels=128, kernel_size=(1, 1)) network = UpSampling2D(2)(network) network = Concatenate()([network, darknet_network.layers[92].output]) network, network_3 = last_layers(network, 128, num_anchors * (num_classes + 5), layer_name="last3") return input, network_1, network_2, network_3 def yolov3_main(input, num_anchors, num_classes): input, network_1, network_2, network_3 = yolov3_base(input, num_anchors, num_classes) return Model(input, [network_1, network_2, network_3]) def yolov3_train(num_classes, anchors, max_box_per_image, max_grid, batch_size, warmup_batches, ignore_thresh, grid_scales, obj_scale, noobj_scale, xywh_scale, class_scale): input_image = Input(shape=(None, None, 3)) # net_h, net_w, 3 true_boxes = Input(shape=(1, 1, 1, max_box_per_image, 4)) true_yolo_1 = Input(shape=(None, None, len(anchors)//6, 4+1+num_classes)) # grid_h, grid_w, nb_anchor, 5+nb_class true_yolo_2 = Input(shape=(None, None, len(anchors)//6, 4+1+num_classes)) # grid_h, grid_w, nb_anchor, 5+nb_class true_yolo_3 = Input(shape=(None, None, len(anchors)//6, 4+1+num_classes)) # grid_h, grid_w, nb_anchor, 5+nb_class _ , network_1, network_2, network_3 = yolov3_base(input_image, len(anchors)//6, num_classes) loss_yolo_1 = YoloLayer(anchors[12:], [1*num for num in max_grid], batch_size, warmup_batches, ignore_thresh, grid_scales[0], obj_scale, noobj_scale, xywh_scale, class_scale)([input_image, network_1, true_yolo_1, true_boxes]) loss_yolo_2 = YoloLayer(anchors[6:12], [2*num for num in max_grid], batch_size, warmup_batches, ignore_thresh, grid_scales[1], obj_scale, noobj_scale, xywh_scale, class_scale)([input_image, network_2, true_yolo_2, true_boxes]) loss_yolo_3 = YoloLayer(anchors[:6], [4*num for num in max_grid], batch_size, warmup_batches, ignore_thresh, grid_scales[2], obj_scale, noobj_scale, xywh_scale, class_scale)([input_image, network_3, true_yolo_3, 
true_boxes]) train_model = Model([input_image, true_boxes, true_yolo_1, true_yolo_2, true_yolo_3], [loss_yolo_1, loss_yolo_2, loss_yolo_3]) infer_model = Model(input_image, [network_1, network_2, network_3]) return [train_model, infer_model] def tiny_yolov3_main(input, num_anchors, num_classes): network_1 = NetworkConv2D_BN_Leaky(input=input, channels=16, kernel_size=(3,3) ) network_1 = MaxPool2D(pool_size=(2,2), strides=(2,2), padding="same")(network_1) network_1 = NetworkConv2D_BN_Leaky(input=network_1, channels=32, kernel_size=(3, 3)) network_1 = MaxPool2D(pool_size=(2, 2), strides=(2, 2), padding="same")(network_1) network_1 = NetworkConv2D_BN_Leaky(input=network_1, channels=64, kernel_size=(3, 3)) network_1 = MaxPool2D(pool_size=(2, 2), strides=(2, 2), padding="same")(network_1) network_1 = NetworkConv2D_BN_Leaky(input=network_1, channels=128, kernel_size=(3, 3)) network_1 = MaxPool2D(pool_size=(2, 2), strides=(2, 2), padding="same")(network_1) network_1 = NetworkConv2D_BN_Leaky(input=network_1, channels=256, kernel_size=(3, 3)) network_2 = MaxPool2D(pool_size=(2, 2), strides=(2, 2), padding="same")(network_1) network_2 = NetworkConv2D_BN_Leaky(input=network_2, channels=512, kernel_size=(3, 3)) network_2 = MaxPool2D(pool_size=(2, 2), strides=(1, 1), padding="same")(network_2) network_2 = NetworkConv2D_BN_Leaky(input=network_2, channels=1024, kernel_size=(3, 3)) network_2 = NetworkConv2D_BN_Leaky(input=network_2, channels=256, kernel_size=(1, 1)) network_3 = NetworkConv2D_BN_Leaky(input=network_2, channels=512, kernel_size=(3, 3)) network_3 = Conv2D(num_anchors * (num_classes + 5), kernel_size=(1,1))(network_3) network_2 = NetworkConv2D_BN_Leaky(input=network_2, channels=128, kernel_size=(1, 1)) network_2 = UpSampling2D(2)(network_2) network_4 = Concatenate()([network_2, network_1]) network_4 = NetworkConv2D_BN_Leaky(input=network_4, channels=256, kernel_size=(3, 3)) network_4 = Conv2D(num_anchors * (num_classes + 5), kernel_size=(1,1))(network_4) return 
Model(input, [network_3, network_4]) def dummy_loss(y_true, y_pred): return tf.sqrt(tf.reduce_sum(y_pred)) ================================================ FILE: imageai_tf_deprecated/Detection/__init__.py ================================================ import cv2 from imageai.Detection.keras_retinanet import models as retinanet_models from imageai.Detection.keras_retinanet.utils.image import read_image_bgr, preprocess_image, resize_image from imageai.Detection.keras_retinanet.utils.visualization import draw_box, draw_caption import matplotlib.pyplot as plt import matplotlib.image as pltimage import numpy as np import tensorflow as tf import os from tensorflow.keras import backend as K from tensorflow.keras.layers import Input from PIL import Image import colorsys import warnings from imageai.Detection.YOLO.yolov3 import tiny_yolov3_main, yolov3_main from imageai.Detection.YOLO.utils import letterbox_image, yolo_eval, preprocess_input, retrieve_yolo_detections, draw_boxes class ObjectDetection: """ This is the object detection class for images in the ImageAI library. It provides support for RetinaNet , YOLOv3 and TinyYOLOv3 object detection networks . After instantiating this class, you can set it's properties and make object detections using it's pre-defined functions. The following functions are required to be called before object detection can be made * setModelPath() * At least of of the following and it must correspond to the model set in the setModelPath() [setModelTypeAsRetinaNet(), setModelTypeAsYOLOv3(), setModelTypeAsTinyYOLOv3()] * loadModel() [This must be called once only before performing object detection] Once the above functions have been called, you can call the detectObjectsFromImage() function of the object detection instance object at anytime to obtain observable objects in any image. 
""" def __init__(self): self.__modelType = "" self.modelPath = "" self.__modelPathAdded = False self.__modelLoaded = False self.__model_collection = [] # Instance variables for RetinaNet Model self.__input_image_min = 1333 self.__input_image_max = 800 self.numbers_to_names = {0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee', 30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite', 34: 'baseball bat', 35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle', 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl', 46: 'banana', 47: 'apple', 48: 'sandwich', 49: 'orange', 50: 'broccoli', 51: 'carrot', 52: 'hot dog', 53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair', 57: 'couch', 58: 'potted plant', 59: 'bed', 60: 'dining table', 61: 'toilet', 62: 'tv', 63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard', 67: 'cell phone', 68: 'microwave', 69: 'oven', 70: 'toaster', 71: 'sink', 72: 'refrigerator', 73: 'book', 74: 'clock', 75: 'vase', 76: 'scissors', 77: 'teddy bear', 78: 'hair dryer', 79: 'toothbrush'} # Unique instance variables for YOLOv3 and TinyYOLOv3 model self.__yolo_iou = 0.45 self.__yolo_score = 0.1 self.__nms_thresh = 0.45 self.__yolo_anchors = [[116,90, 156,198, 373,326], [30,61, 62,45, 59,119], [10,13, 16,30, 33,23]] self.__yolo_model_image_size = (416, 416) self.__yolo_boxes, self.__yolo_scores, self.__yolo_classes = "", "", "" self.__tiny_yolo_anchors = [[81, 82, 135, 169, 344, 319], [10, 14, 23, 27, 37, 58]] self.__box_color = (112, 19, 24) def setModelTypeAsRetinaNet(self): """ 'setModelTypeAsRetinaNet()' is used to set the 
model type to the RetinaNet model for the video object detection instance instance object . :return: """ self.__modelType = "retinanet" def setModelTypeAsYOLOv3(self): """ 'setModelTypeAsYOLOv3()' is used to set the model type to the YOLOv3 model for the video object detection instance instance object . :return: """ self.__modelType = "yolov3" def setModelTypeAsTinyYOLOv3(self): """ 'setModelTypeAsTinyYOLOv3()' is used to set the model type to the TinyYOLOv3 model for the video object detection instance instance object . :return: """ self.__modelType = "tinyyolov3" def setModelPath(self, model_path): """ 'setModelPath()' function is required and is used to set the file path to a RetinaNet object detection model trained on the COCO dataset. :param model_path: :return: """ if (self.__modelPathAdded == False): self.modelPath = model_path self.__modelPathAdded = True def loadModel(self, detection_speed="normal"): """ 'loadModel()' function is required and is used to load the model structure into the program from the file path defined in the setModelPath() function. This function receives an optional value which is "detection_speed". The value is used to reduce the time it takes to detect objects in an image, down to about a 10% of the normal time, with with just slight reduction in the number of objects detected. 
* prediction_speed (optional); Acceptable values are "normal", "fast", "faster", "fastest" and "flash" :param detection_speed: :return: """ if (self.__modelType == "retinanet"): if (detection_speed == "normal"): self.__input_image_min = 800 self.__input_image_max = 1333 elif (detection_speed == "fast"): self.__input_image_min = 400 self.__input_image_max = 700 elif (detection_speed == "faster"): self.__input_image_min = 300 self.__input_image_max = 500 elif (detection_speed == "fastest"): self.__input_image_min = 200 self.__input_image_max = 350 elif (detection_speed == "flash"): self.__input_image_min = 100 self.__input_image_max = 250 elif (self.__modelType == "yolov3"): if (detection_speed == "normal"): self.__yolo_model_image_size = (416, 416) elif (detection_speed == "fast"): self.__yolo_model_image_size = (320, 320) elif (detection_speed == "faster"): self.__yolo_model_image_size = (208, 208) elif (detection_speed == "fastest"): self.__yolo_model_image_size = (128, 128) elif (detection_speed == "flash"): self.__yolo_model_image_size = (96, 96) elif (self.__modelType == "tinyyolov3"): if (detection_speed == "normal"): self.__yolo_model_image_size = (832, 832) elif (detection_speed == "fast"): self.__yolo_model_image_size = (576, 576) elif (detection_speed == "faster"): self.__yolo_model_image_size = (416, 416) elif (detection_speed == "fastest"): self.__yolo_model_image_size = (320, 320) elif (detection_speed == "flash"): self.__yolo_model_image_size = (272, 272) if (self.__modelLoaded == False): if (self.__modelType == ""): raise ValueError("You must set a valid model type before loading the model.") elif (self.__modelType == "retinanet"): model = retinanet_models.load_model(self.modelPath, backbone_name='resnet50') self.__model_collection.append(model) self.__modelLoaded = True elif (self.__modelType == "yolov3" or self.__modelType == "tinyyolov3"): input_image = Input(shape=(None, None, 3)) if self.__modelType == "yolov3": model = yolov3_main(input_image, 
def detectObjectsFromImage(self, input_image="", output_image_path="", input_type="file", output_type="file",
                           extract_detected_objects=False, minimum_percentage_probability=50,
                           display_percentage_probability=True, display_object_name=True,
                           display_box=True, thread_safe=False, custom_objects=None):
    """
    Detect objects in a single image and draw the detections on a copy of it.

    Each detection is a dictionary with the keys:
        * name (string)
        * percentage_probability (float)
        * box_points (list of x1, y1, x2 and y2 coordinates)

    What is returned depends on 'output_type' and 'extract_detected_objects':
        * output_type="file",  extract_detected_objects=False -> detections
        * output_type="file",  extract_detected_objects=True  -> detections, paths of the extracted object images
        * output_type="array", extract_detected_objects=False -> annotated image array, detections
        * output_type="array", extract_detected_objects=True  -> annotated image array, detections,
                                                                 arrays of the extracted objects

    :param input_image: file path (input_type="file") or image array (input_type="array")
    :param output_image_path: where the annotated image is written when output_type="file"; extracted
                              objects are written to the directory '<output_image_path>-objects'
    :param input_type: "file" or "array"
    :param output_type: "file" or "array"
    :param extract_detected_objects: also return every detected object as its own image
    :param minimum_percentage_probability: detections scoring below this percentage are dropped
    :param display_percentage_probability: draw the score next to each box
    :param display_object_name: draw the object name next to each box
    :param display_box: forwarded to draw_boxes()
    :param thread_safe: accepted for interface compatibility; not used by this method
    :param custom_objects: optional dictionary mapping object names to "valid"/"invalid" (see CustomObjects());
                           only names mapped to "valid" are kept
    :return: see the table above
    :raises ValueError: if the model is not loaded, an argument is unsupported, or detection/IO fails
                        (the underlying error is attached as the cause)
    """
    if not self.__modelLoaded:
        raise ValueError("You must call the loadModel() function before making object detection.")
    if input_type not in ("file", "array"):
        raise ValueError("'input_type' must be either 'file' or 'array'.")
    if output_type not in ("file", "array"):
        raise ValueError("'output_type' must be either 'file' or 'array'.")

    try:
        min_probability = minimum_percentage_probability / 100

        if input_type == "file":
            source_image = cv2.imread(input_image)
            # cv2.imread signals an unreadable/missing file by returning None.
            if source_image is None:
                raise ValueError("Unable to read an image from '{}'.".format(input_image))
        else:
            # np.array() copies, so boxes are never drawn on the caller's array.
            source_image = np.array(input_image)

        annotated_image = source_image
        model = self.__model_collection[0]
        model_detections = []

        if self.__modelType in ("yolov3", "tinyyolov3"):
            image_h, image_w, _ = source_image.shape
            network_input = preprocess_input(source_image, self.__yolo_model_image_size)
            yolo_result = model.predict(network_input)
            model_detections = retrieve_yolo_detections(yolo_result,
                                                        self.__yolo_anchors,
                                                        min_probability,
                                                        self.__nms_thresh,
                                                        self.__yolo_model_image_size,
                                                        (image_w, image_h),
                                                        self.numbers_to_names)
        elif self.__modelType == "retinanet":
            network_input = preprocess_image(source_image)
            # Use the sizes chosen by loadModel(detection_speed=...); without them the
            # speed setting has no effect for RetinaNet.
            # NOTE(review): assumes resize_image accepts min_side/max_side keywords - confirm.
            network_input, scale = resize_image(network_input,
                                                min_side=self.__input_image_min,
                                                max_side=self.__input_image_max)
            boxes, scores, labels = model.predict_on_batch(np.expand_dims(network_input, axis=0))
            # Map the boxes back to the coordinates of the un-resized image.
            boxes /= scale

            for box, score, label in zip(boxes[0], scores[0], labels[0]):
                # scores are sorted so we can break
                if score < min_probability:
                    break
                model_detections.append({
                    "name": self.numbers_to_names[label],
                    "percentage_probability": score * 100,
                    "box_points": box.astype(int).tolist(),
                })

        detections = []
        detected_objects_image_array = []
        objects_dir = output_image_path + "-objects"

        # 'counting' numbers every model detection (also the filtered ones) and is
        # used to build unique file names for extracted objects.
        for counting, detection in enumerate(model_detections, start=1):
            label = detection["name"]
            percentage_probability = detection["percentage_probability"]
            box_points = detection["box_points"]

            # A name missing from the filter is treated like an "invalid" one.
            if custom_objects is not None and custom_objects.get(label) != "valid":
                continue

            detections.append(detection)

            shown_label = label if display_object_name else None
            shown_probability = percentage_probability if display_percentage_probability else None

            annotated_image = draw_boxes(annotated_image,
                                         box_points,
                                         display_box,
                                         shown_label,
                                         shown_probability,
                                         self.__box_color)

            if extract_detected_objects:
                object_image = annotated_image[box_points[1]:box_points[3],
                                               box_points[0]:box_points[2]].copy()
                if output_type == "file":
                    os.makedirs(objects_dir, exist_ok=True)
                    object_image_path = os.path.join(objects_dir,
                                                     detection["name"] + "-" + str(counting) + ".jpg")
                    cv2.imwrite(object_image_path, object_image)
                    detected_objects_image_array.append(object_image_path)
                else:
                    detected_objects_image_array.append(object_image)

        if output_type == "file":
            # cv2.imwrite reports failure through its return value.
            if not cv2.imwrite(output_image_path, annotated_image):
                raise ValueError("Unable to write the detected image to '{}'.".format(output_image_path))
            if extract_detected_objects:
                return detections, detected_objects_image_array
            return detections

        if extract_detected_objects:
            return annotated_image, detections, detected_objects_image_array
        return annotated_image, detections
    except Exception as error:
        # Same exception type and message as before, with the real cause attached.
        raise ValueError(
            "Ensure you specified correct input image, input type, output type and/or output image path "
        ) from error
handbag=False, tie=False, suitcase=False, frisbee=False, skis=False, snowboard=False, sports_ball=False, kite=False, baseball_bat=False, baseball_glove=False, skateboard=False, surfboard=False, tennis_racket=False, bottle=False, wine_glass=False, cup=False, fork=False, knife=False, spoon=False, bowl=False, banana=False, apple=False, sandwich=False, orange=False, broccoli=False, carrot=False, hot_dog=False, pizza=False, donut=False, cake=False, chair=False, couch=False, potted_plant=False, bed=False, dining_table=False, toilet=False, tv=False, laptop=False, mouse=False, remote=False, keyboard=False, cell_phone=False, microwave=False, oven=False, toaster=False, sink=False, refrigerator=False, book=False, clock=False, vase=False, scissors=False, teddy_bear=False, hair_dryer=False, toothbrush=False): """ The 'CustomObjects()' function allows you to handpick the type of objects you want to detect from an image. The objects are pre-initiated in the function variables and predefined as 'False', which you can easily set to true for any number of objects available. This function returns a dictionary which must be parsed into the 'detectCustomObjectsFromImage()'. 
Detecting custom objects only happens when you call the function 'detectCustomObjectsFromImage()' * true_values_of_objects (array); Acceptable values are 'True' and False for all object values present :param boolean_values: :return: custom_objects_dict """ custom_objects_dict = {} input_values = [person, bicycle, car, motorcycle, airplane, bus, train, truck, boat, traffic_light, fire_hydrant, stop_sign, parking_meter, bench, bird, cat, dog, horse, sheep, cow, elephant, bear, zebra, giraffe, backpack, umbrella, handbag, tie, suitcase, frisbee, skis, snowboard, sports_ball, kite, baseball_bat, baseball_glove, skateboard, surfboard, tennis_racket, bottle, wine_glass, cup, fork, knife, spoon, bowl, banana, apple, sandwich, orange, broccoli, carrot, hot_dog, pizza, donut, cake, chair, couch, potted_plant, bed, dining_table, toilet, tv, laptop, mouse, remote, keyboard, cell_phone, microwave, oven, toaster, sink, refrigerator, book, clock, vase, scissors, teddy_bear, hair_dryer, toothbrush] actual_labels = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair dryer", "toothbrush"] for input_value, actual_label in zip(input_values, actual_labels): if (input_value == True): 
custom_objects_dict[actual_label] = "valid" else: custom_objects_dict[actual_label] = "invalid" return custom_objects_dict def detectCustomObjectsFromImage(self, input_image="", output_image_path="", input_type="file", output_type="file", extract_detected_objects=False, minimum_percentage_probability=50, display_percentage_probability=True, display_object_name=True, display_box=True, thread_safe=False, custom_objects=None): warnings.warn("'detectCustomObjectsFromImage()' function has been deprecated and will be removed in future versions of ImageAI. \n Kindly use 'detectObjectsFromImage()' ", DeprecationWarning, stacklevel=2) return self.detectObjectsFromImage(input_image=input_image, output_image_path=output_image_path, input_type=input_type, output_type=output_type, extract_detected_objects=extract_detected_objects, minimum_percentage_probability=minimum_percentage_probability, display_percentage_probability=display_percentage_probability, display_object_name=display_object_name, display_box=display_box, thread_safe=thread_safe, custom_objects=custom_objects) class VideoObjectDetection: """ This is the object detection class for videos and camera live stream inputs in the ImageAI library. It provides support for RetinaNet, YOLOv3 and TinyYOLOv3 object detection networks. After instantiating this class, you can set it's properties and make object detections using it's pre-defined functions. 
The following functions are required to be called before object detection can be made * setModelPath() * At least of of the following and it must correspond to the model set in the setModelPath() [setModelTypeAsRetinaNet(), setModelTypeAsYOLOv3(), setModelTinyYOLOv3()] * loadModel() [This must be called once only before performing object detection] Once the above functions have been called, you can call the detectObjectsFromVideo() function or the detectCustomObjectsFromVideo() of the object detection instance object at anytime to obtain observable objects in any video or camera live stream. """ def __init__(self): self.__modelType = "" self.modelPath = "" self.__modelPathAdded = False self.__modelLoaded = False self.__detector = None self.__input_image_min = 1333 self.__input_image_max = 800 self.__detection_storage = None self.numbers_to_names = {0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee', 30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite', 34: 'baseball bat', 35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle', 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl', 46: 'banana', 47: 'apple', 48: 'sandwich', 49: 'orange', 50: 'broccoli', 51: 'carrot', 52: 'hot dog', 53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair', 57: 'couch', 58: 'potted plant', 59: 'bed', 60: 'dining table', 61: 'toilet', 62: 'tv', 63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard', 67: 'cell phone', 68: 'microwave', 69: 'oven', 70: 'toaster', 71: 'sink', 72: 'refrigerator', 73: 'book', 74: 'clock', 75: 'vase', 76: 'scissors', 77: 'teddy bear', 78: 
'hair dryer', 79: 'toothbrush'} def setModelTypeAsRetinaNet(self): """ 'setModelTypeAsRetinaNet()' is used to set the model type to the RetinaNet model for the video object detection instance instance object . :return: """ self.__modelType = "retinanet" def setModelTypeAsYOLOv3(self): """ 'setModelTypeAsYOLOv3()' is used to set the model type to the YOLOv3 model for the video object detection instance instance object . :return: """ self.__modelType = "yolov3" def setModelTypeAsTinyYOLOv3(self): """ 'setModelTypeAsTinyYOLOv3()' is used to set the model type to the TinyYOLOv3 model for the video object detection instance instance object . :return: """ self.__modelType = "tinyyolov3" def setModelPath(self, model_path): """ 'setModelPath()' function is required and is used to set the file path to a RetinaNet, YOLOv3 or TinyYOLOv3 object detection model trained on the COCO dataset. :param model_path: :return: """ if (self.__modelPathAdded == False): self.modelPath = model_path self.__modelPathAdded = True def loadModel(self, detection_speed="normal"): """ 'loadModel()' function is required and is used to load the model structure into the program from the file path defined in the setModelPath() function. This function receives an optional value which is "detection_speed". The value is used to reduce the time it takes to detect objects in an image, down to about a 10% of the normal time, with with just slight reduction in the number of objects detected. 
* prediction_speed (optional); Acceptable values are "normal", "fast", "faster", "fastest" and "flash" :param detection_speed: :return: """ if (self.__modelLoaded == False): frame_detector = ObjectDetection() if (self.__modelType == "retinanet"): frame_detector.setModelTypeAsRetinaNet() elif (self.__modelType == "yolov3"): frame_detector.setModelTypeAsYOLOv3() elif (self.__modelType == "tinyyolov3"): frame_detector.setModelTypeAsTinyYOLOv3() frame_detector.setModelPath(self.modelPath) frame_detector.loadModel(detection_speed) self.__detector = frame_detector self.__modelLoaded = True def detectObjectsFromVideo(self, input_file_path="", camera_input=None, output_file_path="", frames_per_second=20, frame_detection_interval=1, minimum_percentage_probability=50, log_progress=False, display_percentage_probability=True, display_object_name=True, display_box=True, save_detected_video=True, per_frame_function=None, per_second_function=None, per_minute_function=None, video_complete_function=None, return_detected_frame=False, detection_timeout = None, thread_safe=False, custom_objects=None): """ 'detectObjectsFromVideo()' function is used to detect objects observable in the given video path or a camera input: * input_file_path , which is the file path to the input video. It is required only if 'camera_input' is not set * camera_input , allows you to parse in camera input for live video detections * output_file_path , which is the path to the output video. It is required only if 'save_detected_video' is not set to False * frames_per_second , which is the number of frames to be used in the output video * frame_detection_interval (optional, 1 by default) , which is the intervals of frames that will be detected. * minimum_percentage_probability (optional, 50 by default) , option to set the minimum percentage probability for nominating a detected object for output. 
* log_progress (optional) , which states if the progress of the frame processed is to be logged to console * display_percentage_probability (optional), can be used to hide or show probability scores on the detected video frames * display_object_name (optional), can be used to show or hide object names on the detected video frames * save_save_detected_video (optional, True by default), can be set to or not to save the detected video * per_frame_function (optional), this parameter allows you to parse in a function you will want to execute after each frame of the video is detected. If this parameter is set to a function, after every video frame is detected, the function will be executed with the following values parsed into it: -- position number of the frame -- an array of dictinaries, with each dictionary corresponding to each object detected. Each dictionary contains 'name', 'percentage_probability' and 'box_points' -- a dictionary with with keys being the name of each unique objects and value are the number of instances of the object present -- If return_detected_frame is set to True, the numpy array of the detected frame will be parsed as the fourth value into the function * per_second_function (optional), this parameter allows you to parse in a function you will want to execute after each second of the video is detected. 
If this parameter is set to a function, after every second of a video is detected, the function will be executed with the following values parsed into it: -- position number of the second -- an array of dictionaries whose keys are position number of each frame present in the last second , and the value for each key is the array for each frame that contains the dictionaries for each object detected in the frame -- an array of dictionaries, with each dictionary corresponding to each frame in the past second, and the keys of each dictionary are the name of the number of unique objects detected in each frame, and the key values are the number of instances of the objects found in the frame -- a dictionary with its keys being the name of each unique object detected throughout the past second, and the key values are the average number of instances of the object found in all the frames contained in the past second -- If return_detected_frame is set to True, the numpy array of the detected frame will be parsed as the fifth value into the function * per_minute_function (optional), this parameter allows you to parse in a function you will want to execute after each minute of the video is detected. 
If this parameter is set to a function, after every minute of a video is detected, the function will be executed with the following values parsed into it: -- position number of the minute -- an array of dictionaries whose keys are position number of each frame present in the last minute , and the value for each key is the array for each frame that contains the dictionaries for each object detected in the frame -- an array of dictionaries, with each dictionary corresponding to each frame in the past minute, and the keys of each dictionary are the name of the number of unique objects detected in each frame, and the key values are the number of instances of the objects found in the frame -- a dictionary with its keys being the name of each unique object detected throughout the past minute, and the key values are the average number of instances of the object found in all the frames contained in the past minute -- If return_detected_frame is set to True, the numpy array of the detected frame will be parsed as the fifth value into the function * video_complete_function (optional), this parameter allows you to parse in a function you will want to execute after all of the video frames have been detected. 
If this parameter is set to a function, after all of frames of a video is detected, the function will be executed with the following values parsed into it: -- an array of dictionaries whose keys are position number of each frame present in the entire video , and the value for each key is the array for each frame that contains the dictionaries for each object detected in the frame -- an array of dictionaries, with each dictionary corresponding to each frame in the entire video, and the keys of each dictionary are the name of the number of unique objects detected in each frame, and the key values are the number of instances of the objects found in the frame -- a dictionary with its keys being the name of each unique object detected throughout the entire video, and the key values are the average number of instances of the object found in all the frames contained in the entire video * return_detected_frame (optionally, False by default), option to obtain the return the last detected video frame into the per_per_frame_function, per_per_second_function or per_per_minute_function * detection_timeout (optionally, None by default), option to state the number of seconds of a video that should be detected after which the detection function stop processing the video * thread_safe (optional, False by default), enforce the loaded detection model works across all threads if set to true, made possible by forcing all Tensorflow inference to run on the default graph. 
:param input_file_path: :param camera_input :param output_file_path: :param save_detected_video: :param frames_per_second: :param frame_detection_interval: :param minimum_percentage_probability: :param log_progress: :param display_percentage_probability: :param display_object_name: :param per_frame_function: :param per_second_function: :param per_minute_function: :param video_complete_function: :param return_detected_frame: :param detection_timeout: :param thread_safe: :return output_video_filepath: :return counting: :return output_objects_array: :return output_objects_count: :return detected_copy: :return this_second_output_object_array: :return this_second_counting_array: :return this_second_counting: :return this_minute_output_object_array: :return this_minute_counting_array: :return this_minute_counting: :return this_video_output_object_array: :return this_video_counting_array: :return this_video_counting: """ if (input_file_path == "" and camera_input == None): raise ValueError( "You must set 'input_file_path' to a valid video file, or set 'camera_input' to a valid camera") elif (save_detected_video == True and output_file_path == ""): raise ValueError( "You must set 'output_video_filepath' to a valid video file name, in which the detected video will be saved. 
If you don't intend to save the detected video, set 'save_detected_video=False'") else: try: output_frames_dict = {} output_frames_count_dict = {} input_video = cv2.VideoCapture(input_file_path) if (camera_input != None): input_video = camera_input output_video_filepath = output_file_path + '.avi' frame_width = int(input_video.get(3)) frame_height = int(input_video.get(4)) output_video = cv2.VideoWriter(output_video_filepath, cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), frames_per_second, (frame_width, frame_height)) counting = 0 detection_timeout_count = 0 video_frames_count = 0 while (input_video.isOpened()): ret, frame = input_video.read() if (ret == True): video_frames_count += 1 if (detection_timeout != None): if ((video_frames_count % frames_per_second) == 0): detection_timeout_count += 1 if (detection_timeout_count >= detection_timeout): break output_objects_array = [] counting += 1 if (log_progress == True): print("Processing Frame : ", str(counting)) detected_copy = frame.copy() check_frame_interval = counting % frame_detection_interval if (counting == 1 or check_frame_interval == 0): try: detected_copy, output_objects_array = self.__detector.detectObjectsFromImage( input_image=frame, input_type="array", output_type="array", minimum_percentage_probability=minimum_percentage_probability, display_percentage_probability=display_percentage_probability, display_object_name=display_object_name, display_box=display_box, custom_objects=custom_objects) except: None output_frames_dict[counting] = output_objects_array output_objects_count = {} for eachItem in output_objects_array: eachItemName = eachItem["name"] try: output_objects_count[eachItemName] = output_objects_count[eachItemName] + 1 except: output_objects_count[eachItemName] = 1 output_frames_count_dict[counting] = output_objects_count if (save_detected_video == True): output_video.write(detected_copy) if (counting == 1 or check_frame_interval == 0): if (per_frame_function != None): if (return_detected_frame 
== True): per_frame_function(counting, output_objects_array, output_objects_count, detected_copy) elif (return_detected_frame == False): per_frame_function(counting, output_objects_array, output_objects_count) if (per_second_function != None): if (counting != 1 and (counting % frames_per_second) == 0): this_second_output_object_array = [] this_second_counting_array = [] this_second_counting = {} for aa in range(counting): if (aa >= (counting - frames_per_second)): this_second_output_object_array.append(output_frames_dict[aa + 1]) this_second_counting_array.append(output_frames_count_dict[aa + 1]) for eachCountingDict in this_second_counting_array: for eachItem in eachCountingDict: try: this_second_counting[eachItem] = this_second_counting[eachItem] + \ eachCountingDict[eachItem] except: this_second_counting[eachItem] = eachCountingDict[eachItem] for eachCountingItem in this_second_counting: this_second_counting[eachCountingItem] = int(this_second_counting[eachCountingItem] / frames_per_second) if (return_detected_frame == True): per_second_function(int(counting / frames_per_second), this_second_output_object_array, this_second_counting_array, this_second_counting, detected_copy) elif (return_detected_frame == False): per_second_function(int(counting / frames_per_second), this_second_output_object_array, this_second_counting_array, this_second_counting) if (per_minute_function != None): if (counting != 1 and (counting % (frames_per_second * 60)) == 0): this_minute_output_object_array = [] this_minute_counting_array = [] this_minute_counting = {} for aa in range(counting): if (aa >= (counting - (frames_per_second * 60))): this_minute_output_object_array.append(output_frames_dict[aa + 1]) this_minute_counting_array.append(output_frames_count_dict[aa + 1]) for eachCountingDict in this_minute_counting_array: for eachItem in eachCountingDict: try: this_minute_counting[eachItem] = this_minute_counting[eachItem] + \ eachCountingDict[eachItem] except: 
this_minute_counting[eachItem] = eachCountingDict[eachItem] for eachCountingItem in this_minute_counting: this_minute_counting[eachCountingItem] = int(this_minute_counting[eachCountingItem] / (frames_per_second * 60)) if (return_detected_frame == True): per_minute_function(int(counting / (frames_per_second * 60)), this_minute_output_object_array, this_minute_counting_array, this_minute_counting, detected_copy) elif (return_detected_frame == False): per_minute_function(int(counting / (frames_per_second * 60)), this_minute_output_object_array, this_minute_counting_array, this_minute_counting) else: break if (video_complete_function != None): this_video_output_object_array = [] this_video_counting_array = [] this_video_counting = {} for aa in range(counting): this_video_output_object_array.append(output_frames_dict[aa + 1]) this_video_counting_array.append(output_frames_count_dict[aa + 1]) for eachCountingDict in this_video_counting_array: for eachItem in eachCountingDict: try: this_video_counting[eachItem] = this_video_counting[eachItem] + \ eachCountingDict[eachItem] except: this_video_counting[eachItem] = eachCountingDict[eachItem] for eachCountingItem in this_video_counting: this_video_counting[eachCountingItem] = int(this_video_counting[eachCountingItem] / counting) video_complete_function(this_video_output_object_array, this_video_counting_array, this_video_counting) input_video.release() output_video.release() if (save_detected_video == True): return output_video_filepath except: raise ValueError( "An error occured. It may be that your input video is invalid. Ensure you specified a proper string value for 'output_file_path' is 'save_detected_video' is not False. " "Also ensure your per_frame, per_second, per_minute or video_complete_analysis function is properly configured to receive the right parameters. 
") def CustomObjects(self, person=False, bicycle=False, car=False, motorcycle=False, airplane=False, bus=False, train=False, truck=False, boat=False, traffic_light=False, fire_hydrant=False, stop_sign=False, parking_meter=False, bench=False, bird=False, cat=False, dog=False, horse=False, sheep=False, cow=False, elephant=False, bear=False, zebra=False, giraffe=False, backpack=False, umbrella=False, handbag=False, tie=False, suitcase=False, frisbee=False, skis=False, snowboard=False, sports_ball=False, kite=False, baseball_bat=False, baseball_glove=False, skateboard=False, surfboard=False, tennis_racket=False, bottle=False, wine_glass=False, cup=False, fork=False, knife=False, spoon=False, bowl=False, banana=False, apple=False, sandwich=False, orange=False, broccoli=False, carrot=False, hot_dog=False, pizza=False, donut=False, cake=False, chair=False, couch=False, potted_plant=False, bed=False, dining_table=False, toilet=False, tv=False, laptop=False, mouse=False, remote=False, keyboard=False, cell_phone=False, microwave=False, oven=False, toaster=False, sink=False, refrigerator=False, book=False, clock=False, vase=False, scissors=False, teddy_bear=False, hair_dryer=False, toothbrush=False): """ The 'CustomObjects()' function allows you to handpick the type of objects you want to detect from a video. The objects are pre-initiated in the function variables and predefined as 'False', which you can easily set to true for any number of objects available. This function returns a dictionary which must be parsed into the 'detectCustomObjectsFromVideo()'. 
Detecting custom objects only happens when you call the function 'detectCustomObjectsFromVideo()' * true_values_of_objects (array); Acceptable values are 'True' and False for all object values present :param boolean_values: :return: custom_objects_dict """ custom_objects_dict = {} input_values = [person, bicycle, car, motorcycle, airplane, bus, train, truck, boat, traffic_light, fire_hydrant, stop_sign, parking_meter, bench, bird, cat, dog, horse, sheep, cow, elephant, bear, zebra, giraffe, backpack, umbrella, handbag, tie, suitcase, frisbee, skis, snowboard, sports_ball, kite, baseball_bat, baseball_glove, skateboard, surfboard, tennis_racket, bottle, wine_glass, cup, fork, knife, spoon, bowl, banana, apple, sandwich, orange, broccoli, carrot, hot_dog, pizza, donut, cake, chair, couch, potted_plant, bed, dining_table, toilet, tv, laptop, mouse, remote, keyboard, cell_phone, microwave, oven, toaster, sink, refrigerator, book, clock, vase, scissors, teddy_bear, hair_dryer, toothbrush] actual_labels = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair dryer", "toothbrush"] for input_value, actual_label in zip(input_values, actual_labels): if (input_value == True): 
def detectCustomObjectsFromVideo(self, input_file_path="", camera_input=None, output_file_path="",
                                 frames_per_second=20, frame_detection_interval=1,
                                 minimum_percentage_probability=50, log_progress=False,
                                 display_percentage_probability=True, display_object_name=True,
                                 display_box=True, save_detected_video=True, per_frame_function=None,
                                 per_second_function=None, per_minute_function=None,
                                 video_complete_function=None, return_detected_frame=False,
                                 detection_timeout=None, thread_safe=False, custom_objects=None):
    """
    Detect objects in a video, optionally restricted to a custom object selection.

    This is a thin wrapper: every argument is forwarded by keyword, unchanged, to
    'detectObjectsFromVideo()' and the value that call returns is returned as is.

    :param custom_objects: the selection dictionary to forward (see 'CustomObjects()'), or None
    :return: whatever 'detectObjectsFromVideo()' returns
    """
    forwarded = {
        "input_file_path": input_file_path,
        "camera_input": camera_input,
        "output_file_path": output_file_path,
        "frames_per_second": frames_per_second,
        "frame_detection_interval": frame_detection_interval,
        "minimum_percentage_probability": minimum_percentage_probability,
        "log_progress": log_progress,
        "display_percentage_probability": display_percentage_probability,
        "display_object_name": display_object_name,
        "display_box": display_box,
        "save_detected_video": save_detected_video,
        "per_frame_function": per_frame_function,
        "per_second_function": per_second_function,
        "per_minute_function": per_minute_function,
        "video_complete_function": video_complete_function,
        "return_detected_frame": return_detected_frame,
        "detection_timeout": detection_timeout,
        "thread_safe": thread_safe,
        "custom_objects": custom_objects,
    }
    return self.detectObjectsFromVideo(**forwarded)
def bbox_transform_inv(boxes, deltas, mean=None, std=None):
    """ Apply regression deltas to boxes (usually anchors) and return the resulting boxes.

    The deltas are first de-normalised with `std` and `mean` (the values applied in the generator),
    then scaled by the box width (x-coordinates) or height (y-coordinates) and added to the box.

    Args
        boxes : np.array of shape (B, N, 4), where B is the batch size, N the number of boxes and 4 values for (x1, y1, x2, y2).
        deltas: np.array of same shape as boxes. These deltas (d_x1, d_y1, d_x2, d_y2) are a factor of the width/height.
        mean  : The mean value used when computing deltas (defaults to [0, 0, 0, 0]).
        std   : The standard deviation used when computing deltas (defaults to [0.2, 0.2, 0.2, 0.2]).

    Returns
        A tensor of the same shape as boxes, with the deltas applied to each box.
    """
    mean = [0, 0, 0, 0] if mean is None else mean
    std = [0.2, 0.2, 0.2, 0.2] if std is None else std

    box_width = boxes[:, :, 2] - boxes[:, :, 0]
    box_height = boxes[:, :, 3] - boxes[:, :, 1]

    # Coordinate order is (x1, y1, x2, y2): x-values scale with the width, y-values with the height.
    extents = (box_width, box_height, box_width, box_height)

    shifted_coordinates = [
        boxes[:, :, index] + (deltas[:, :, index] * std[index] + mean[index]) * extents[index]
        for index in range(4)
    ]
    return keras.backend.stack(shifted_coordinates, axis=2)


def shift(shape, stride, anchors):
    """ Produce shifted anchors based on shape of the map and stride size.

    Args
        shape  : Shape to shift the anchors over; shape[0] is used for the y-axis and shape[1] for the x-axis.
        stride : Stride to shift the anchors with over the shape.
        anchors: The anchors to apply at each location.

    Returns
        A tensor of shape (k * number_of_anchors, 4), where k is the number of grid locations.
    """
    backend = keras.backend

    def cell_centres(cell_count):
        # centre of cell i sits at (i + 0.5) * stride
        indices = backend.arange(0, cell_count, dtype=backend.floatx())
        return (indices + backend.constant(0.5, dtype=backend.floatx())) * stride

    grid_x, grid_y = tensorflow.meshgrid(cell_centres(shape[1]), cell_centres(shape[0]))
    flat_x = backend.reshape(grid_x, [-1])
    flat_y = backend.reshape(grid_y, [-1])

    # one (x, y, x, y) offset row per grid location
    offsets = backend.transpose(backend.stack([flat_x, flat_y, flat_x, flat_y], axis=0))

    anchor_count = backend.shape(anchors)[0]
    location_count = backend.shape(offsets)[0]  # number of base points = feat_h * feat_w

    # broadcast every anchor over every location, then flatten to a list of boxes
    per_location = (
        backend.reshape(anchors, [1, anchor_count, 4])
        + backend.cast(backend.reshape(offsets, [location_count, 1, 4]), backend.floatx())
    )
    return backend.reshape(per_location, [location_count * anchor_count, 4])
def map_fn(*args, **kwargs):
    """ Call tensorflow.map_fn, translating the optional `shapes`/`dtype` keywords.

    See https://www.tensorflow.org/api_docs/python/tf/map_fn .

    When a `shapes` keyword is given, `shapes` and `dtype` are combined into a list of
    TensorSpec objects and passed as `fn_output_signature`. If that call raises TypeError,
    the call is repeated with the plain `dtype` keyword instead.
    """
    if "shapes" not in kwargs:
        return tensorflow.map_fn(*args, **kwargs)

    shapes = kwargs.pop("shapes")
    dtype = kwargs.pop("dtype")
    signature = [tensorflow.TensorSpec(shapes[position], dtype=element_type)
                 for position, element_type in enumerate(dtype)]

    # Try to use the new feature fn_output_signature in TF 2.3, use fallback if this is not available
    try:
        return tensorflow.map_fn(*args, **kwargs, fn_output_signature=signature)
    except TypeError:
        kwargs["dtype"] = dtype
    return tensorflow.map_fn(*args, **kwargs)


def resize_images(images, size, method='bilinear', align_corners=False):
    """ Resize images with tensorflow.compat.v1.image.resize_images.

    See https://www.tensorflow.org/versions/r1.14/api_docs/python/tf/image/resize_images .

    Args
        images       : The images to resize.
        size         : The target size, passed through unchanged.
        method       : The method used for interpolation. One of ('bilinear', 'nearest', 'bicubic', 'area').
        align_corners: Passed through unchanged.

    Raises
        KeyError: if `method` is not one of the names listed above.
    """
    resize_method = tensorflow.image.ResizeMethod
    method_by_name = {
        'bilinear': resize_method.BILINEAR,
        'nearest': resize_method.NEAREST_NEIGHBOR,
        'bicubic': resize_method.BICUBIC,
        'area': resize_method.AREA,
    }
    selected = method_by_name[method]
    return tensorflow.compat.v1.image.resize_images(images, size, selected, align_corners)
def parse_args(args):
    """ Parse the command line arguments of the model conversion script.

    Args
        args: List of argument strings (without the program name).

    Returns
        The argparse namespace with the parsed values.
    """
    cli = argparse.ArgumentParser(description='Script for converting a training model to an inference model.')

    # (names, keyword options) for every argument, in registration order.
    argument_specs = (
        (('model_in',), {'help': 'The model to convert.'}),
        (('model_out',), {'help': 'Path to save the converted model to.'}),
        (('--backbone',), {'help': 'The backbone of the model to convert.', 'default': 'resnet50'}),
        (('--no-nms',), {'help': 'Disables non maximum suppression.', 'dest': 'nms', 'action': 'store_false'}),
        (('--no-class-specific-filter',), {
            'help': 'Disables class specific filtering.',
            'dest': 'class_specific_filter',
            'action': 'store_false',
        }),
        (('--config',), {'help': 'Path to a configuration parameters .ini file.'}),
        (('--nms-threshold',), {
            'help': 'Value for non maximum suppression threshold.', 'type': float, 'default': 0.5,
        }),
        (('--score-threshold',), {
            'help': 'Threshold for prefiltering boxes.', 'type': float, 'default': 0.05,
        }),
        (('--max-detections',), {
            'help': 'Maximum number of detections to keep.', 'type': int, 'default': 300,
        }),
        (('--parallel-iterations',), {
            'help': 'Number of batch items to process in parallel.', 'type': int, 'default': 32,
        }),
    )
    for names, options in argument_specs:
        cli.add_argument(*names, **options)

    return cli.parse_args(args)
def main(args=None):
    """ Load a training model, convert it to an inference model and save it.

    Args
        args: List of argument strings; sys.argv[1:] is used when None.
    """
    # parse arguments
    args = parse_args(sys.argv[1:] if args is None else args)

    # make sure tensorflow is the minimum required version
    check_tf_version()

    # set modified tf session to avoid using the GPUs
    setup_gpu('cpu')

    # optionally load config parameters; both stay None unless the config file defines them
    anchor_parameters = None
    pyramid_levels = None
    if args.config:
        args.config = read_config_file(args.config)
        parsed_config = args.config
        if 'anchor_parameters' in parsed_config:
            anchor_parameters = parse_anchor_parameters(parsed_config)
        if 'pyramid_levels' in parsed_config:
            pyramid_levels = parse_pyramid_levels(parsed_config)

    # load the model and check that it is indeed a training model
    training_model = models.load_model(args.model_in, backbone_name=args.backbone)
    models.check_training_model(training_model)

    # convert the model
    inference_model = models.convert_model(
        training_model,
        nms=args.nms,
        class_specific_filter=args.class_specific_filter,
        anchor_params=anchor_parameters,
        pyramid_levels=pyramid_levels,
        nms_threshold=args.nms_threshold,
        score_threshold=args.score_threshold,
        max_detections=args.max_detections,
        parallel_iterations=args.parallel_iterations
    )

    # save model
    inference_model.save(args.model_out)
def create_generator(args):
    """ Create the data generator for the dataset type selected on the command line.

    Args:
        args: parseargs arguments object.

    Returns:
        The generator for `args.dataset_type` ('coco', 'pascal', 'csv', 'oid' or 'kitti').

    Raises:
        ValueError: if `args.dataset_type` is none of the supported types.
    """
    # create random transform generator for augmenting training data
    transform_generator = random_transform_generator(
        min_rotation=-0.1,
        max_rotation=0.1,
        min_translation=(-0.1, -0.1),
        max_translation=(0.1, 0.1),
        min_shear=-0.1,
        max_shear=0.1,
        min_scaling=(0.9, 0.9),
        max_scaling=(1.1, 1.1),
        flip_x_chance=0.5,
        flip_y_chance=0.5,
    )

    visual_effect_generator = random_visual_effect_generator(
        contrast_range=(0.9, 1.1),
        brightness_range=(-.1, .1),
        hue_range=(-0.05, 0.05),
        saturation_range=(0.95, 1.05)
    )

    # keyword arguments every generator receives
    shared_options = {
        'transform_generator': transform_generator,
        'visual_effect_generator': visual_effect_generator,
        'config': args.config,
        'image_min_side': args.image_min_side,
        'image_max_side': args.image_max_side,
        'group_method': args.group_method,
    }

    dataset_type = args.dataset_type

    if dataset_type == 'coco':
        # import here to prevent unnecessary dependency on cocoapi
        from ..preprocessing.coco import CocoGenerator

        return CocoGenerator(args.coco_path, args.coco_set, **shared_options)

    if dataset_type == 'pascal':
        return PascalVocGenerator(
            args.pascal_path,
            args.pascal_set,
            image_extension=args.image_extension,
            **shared_options
        )

    if dataset_type == 'csv':
        return CSVGenerator(args.annotations, args.classes, **shared_options)

    if dataset_type == 'oid':
        return OpenImagesGenerator(
            args.main_dir,
            subset=args.subset,
            version=args.version,
            labels_filter=args.labels_filter,
            parent_label=args.parent_label,
            annotation_cache_dir=args.annotation_cache_dir,
            **shared_options
        )

    if dataset_type == 'kitti':
        return KittiGenerator(args.kitti_path, subset=args.subset, **shared_options)

    raise ValueError('Invalid data type received: {}'.format(args.dataset_type))
def parse_args(args):
    """ Parse the command line arguments of the debug script.

    The dataset type is a required sub-command ('coco', 'pascal', 'kitti', 'oid' or 'csv'),
    stored as `dataset_type`; the remaining options belong to the top-level parser.

    Args
        args: List of argument strings (without the program name).

    Returns
        The argparse namespace with the parsed values.
    """
    def csv_list(string):
        return string.split(',')

    root = argparse.ArgumentParser(description='Debug script for a RetinaNet network.')
    datasets = root.add_subparsers(help='Arguments for specific dataset types.', dest='dataset_type')
    datasets.required = True

    # sub-command name -> its (names, keyword options) arguments, in registration order
    dataset_specs = (
        ('coco', (
            (('coco_path',), {'help': 'Path to dataset directory (ie. /tmp/COCO).'}),
            (('--coco-set',), {'help': 'Name of the set to show (defaults to val2017).', 'default': 'val2017'}),
        )),
        ('pascal', (
            (('pascal_path',), {'help': 'Path to dataset directory (ie. /tmp/VOCdevkit).'}),
            (('--pascal-set',), {'help': 'Name of the set to show (defaults to test).', 'default': 'test'}),
            (('--image-extension',), {'help': "Declares the dataset images' extension.", 'default': '.jpg'}),
        )),
        ('kitti', (
            (('kitti_path',), {'help': 'Path to dataset directory (ie. /tmp/kitti).'}),
            (('subset',), {'help': 'Argument for loading a subset from train/val.'}),
        )),
        ('oid', (
            (('main_dir',), {'help': 'Path to dataset directory.'}),
            (('subset',), {'help': 'Argument for loading a subset from train/validation/test.'}),
            (('--version',), {'help': 'The current dataset version is v4.', 'default': 'v4'}),
            (('--labels-filter',), {'help': 'A list of labels to filter.', 'type': csv_list, 'default': None}),
            (('--annotation-cache-dir',), {'help': 'Path to store annotation cache.', 'default': '.'}),
            (('--parent-label',), {'help': 'Use the hierarchy children of this label.', 'default': None}),
        )),
        ('csv', (
            (('annotations',), {'help': 'Path to CSV file containing annotations for evaluation.'}),
            (('classes',), {'help': 'Path to a CSV file containing class label mapping.'}),
        )),
    )
    for dataset_name, argument_specs in dataset_specs:
        dataset_parser = datasets.add_parser(dataset_name)
        for names, options in argument_specs:
            dataset_parser.add_argument(*names, **options)

    # options of the top-level parser
    shared_specs = (
        (('--no-resize',), {'help': 'Disable image resizing.', 'dest': 'resize', 'action': 'store_false'}),
        (('--anchors',), {'help': 'Show positive anchors on the image.', 'action': 'store_true'}),
        (('--display-name',), {
            'help': 'Display image name on the bottom left corner.', 'action': 'store_true',
        }),
        (('--show-annotations',), {
            'help': "Show annotations on the image. Green annotations have anchors, red annotations don't and therefore don't contribute to training.",
            'action': 'store_true',
        }),
        (('--random-transform',), {
            'help': 'Randomly transform image and annotations.', 'action': 'store_true',
        }),
        (('--image-min-side',), {
            'help': 'Rescale the image so the smallest side is min_side.', 'type': int, 'default': 800,
        }),
        (('--image-max-side',), {
            'help': 'Rescale the image if the largest side is larger than max_side.', 'type': int, 'default': 1333,
        }),
        (('--config',), {'help': 'Path to a configuration parameters .ini file.'}),
        (('--no-gui',), {
            'help': 'Do not open a GUI window. Save images to an output directory instead.',
            'action': 'store_true',
        }),
        (('--output-dir',), {
            'help': 'The output directory to save images to if --no-gui is specified.', 'default': '.',
        }),
        (('--flatten-output',), {
            'help': 'Flatten the folder structure of saved output images into a single folder.',
            'action': 'store_true',
        }),
        (('--group-method',), {
            'help': 'Determines how images are grouped together',
            'type': str,
            'default': 'ratio',
            'choices': ['none', 'random', 'ratio'],
        }),
    )
    for names, options in shared_specs:
        root.add_argument(*names, **options)

    return root.parse_args(args)
def run(generator, args, anchor_params, pyramid_levels):
    """ Main loop: annotate the generator's images one at a time and show or save them.

    Args
        generator: The generator to debug.
        args: parseargs args object.
        anchor_params: Anchor parameters passed on to anchors_for_shape (may be None).
        pyramid_levels: Pyramid levels passed on to anchors_for_shape (may be None).

    Returns
        False when 'q' or Esc is pressed in GUI mode, True once every image has been
        written in --no-gui mode.
    """
    # NOTE(review): block nesting below was reconstructed from a whitespace-collapsed source;
    # confirm against the original file.

    # display images, one at a time
    i = 0
    while True:
        # load the data
        image = generator.load_image(i)
        annotations = generator.load_annotations(i)
        # images without any label skip all transformation and drawing steps
        if len(annotations['labels']) > 0 :
            # apply random transformations
            if args.random_transform:
                image, annotations = generator.random_transform_group_entry(image, annotations)
                image, annotations = generator.random_visual_effect_group_entry(image, annotations)

            # resize the image and annotations
            if args.resize:
                image, image_scale = generator.resize_image(image)
                # keep the boxes aligned with the resized image
                annotations['bboxes'] *= image_scale

            anchors = anchors_for_shape(image.shape, anchor_params=anchor_params, pyramid_levels=pyramid_levels)
            positive_indices, _, max_indices = compute_gt_annotations(anchors, annotations['bboxes'])

            # draw anchors on the image
            if args.anchors:
                draw_boxes(image, anchors[positive_indices], (255, 255, 0), thickness=1)

            # draw annotations on the image
            if args.show_annotations:
                # draw annotations in red
                draw_annotations(image, annotations, color=(0, 0, 255), label_to_name=generator.label_to_name)

                # draw regressed anchors in green to override most red annotations
                # result is that annotations without anchors are red, with anchors are green
                draw_boxes(image, annotations['bboxes'][max_indices[positive_indices], :], (0, 255, 0))

            # display name on the image
            if args.display_name:
                draw_caption(image, [0, image.shape[0]], os.path.basename(generator.image_path(i)))

        # write to file and advance if no-gui selected
        if args.no_gui:
            output_path = make_output_path(args.output_dir, generator.image_path(i), flatten=args.flatten_output)
            os.makedirs(os.path.dirname(output_path), exist_ok=True)
            cv2.imwrite(output_path, image)
            i += 1
            if i == generator.size():  # have written all images
                break
            else:
                continue

        # if we are using the GUI, then show an image
        cv2.imshow('Image', image)
        key = cv2.waitKeyEx()

        # press right for next image and left for previous (linux or windows, doesn't work for macOS)
        # if you run macOS, press "n" or "m" (will also work on linux and windows)

        # both directions wrap around at the ends of the dataset
        if key in rightkeys:
            i = (i + 1) % generator.size()
        if key in leftkeys:
            i -= 1
            if i < 0:
                i = generator.size() - 1

        # press q or Esc to quit
        if (key == ord('q')) or (key == 27):
            return False

    return True
def make_output_path(output_dir, image_path, flatten = False):
    """ Compute the output path for a debug image.

    Args
        output_dir: Directory the debug image is written below.
        image_path: Path of the source image.
        flatten: If True, only the file name of `image_path` is kept; otherwise its directory
            structure is reproduced below `output_dir`.

    Returns
        `output_dir` joined with the (relative) image path, with "_debug" inserted before the
        file extension.
    """
    # If the output hierarchy is flattened to a single folder, throw away all leading folders.
    if flatten:
        path = os.path.basename(image_path)

    # Otherwise, make sure absolute paths are taken relative to the filesystem root.
    else:
        # Make sure to drop drive letters on Windows, otherwise relpath will fail.
        _, path = os.path.splitdrive(image_path)

        # Test for a leading separator instead of calling os.path.isabs(): since Python 3.13,
        # isabs() on Windows no longer treats a drive-less rooted path (which is what remains
        # after splitdrive) as absolute. The path would then stay rooted and os.path.join()
        # below would silently discard output_dir.
        separators = tuple(sep for sep in (os.sep, os.altsep) if sep)
        if path.startswith(separators):
            path = os.path.relpath(path, '/')

    # In all cases, append "_debug" to the filename, before the extension.
    base, extension = os.path.splitext(path)
    path = base + "_debug" + extension

    # Finally, join the whole thing to the output directory.
    return os.path.join(output_dir, path)


def main(args=None):
    """ Entry point of the debug script: parse arguments, build the generator and run the loop.

    Args
        args: List of argument strings; sys.argv[1:] is used when None.
    """
    # parse arguments
    if args is None:
        args = sys.argv[1:]
    args = parse_args(args)

    # make sure tensorflow is the minimum required version
    check_tf_version()

    # optionally load config parameters
    # Done before the generator is created, as in the evaluation script, so the generator
    # receives the parsed configuration rather than the path of the .ini file.
    if args.config:
        args.config = read_config_file(args.config)

    # create the generator
    generator = create_generator(args)

    # optionally load anchor parameters
    anchor_params = None
    if args.config and 'anchor_parameters' in args.config:
        anchor_params = parse_anchor_parameters(args.config)

    pyramid_levels = None
    if args.config and 'pyramid_levels' in args.config:
        pyramid_levels = parse_pyramid_levels(args.config)

    # create the display window if necessary
    if not args.no_gui:
        cv2.namedWindow('Image', cv2.WINDOW_NORMAL)

    run(generator, args, anchor_params=anchor_params, pyramid_levels=pyramid_levels)
def create_generator(args, preprocess_image):
    """ Create the validation generator for the dataset type selected on the command line.

    Args
        args: parseargs args object.
        preprocess_image: Image preprocessing function handed to the generator.

    Returns
        The generator for `args.dataset_type` ('coco', 'pascal' or 'csv'), created with
        shuffle_groups=False.

    Raises
        ValueError: if `args.dataset_type` is none of the supported types.
    """
    # keyword arguments every generator receives
    generator_options = {
        'shuffle_groups': False,
        'config': args.config,
        'image_min_side': args.image_min_side,
        'image_max_side': args.image_max_side,
        'no_resize': args.no_resize,
        'preprocess_image': preprocess_image,
        'group_method': args.group_method,
    }

    dataset_type = args.dataset_type

    if dataset_type == 'coco':
        # import here to prevent unnecessary dependency on cocoapi
        from ..preprocessing.coco import CocoGenerator

        return CocoGenerator(args.coco_path, 'val2017', **generator_options)

    if dataset_type == 'pascal':
        return PascalVocGenerator(
            args.pascal_path,
            'test',
            image_extension=args.image_extension,
            **generator_options
        )

    if dataset_type == 'csv':
        return CSVGenerator(args.annotations, args.classes, **generator_options)

    raise ValueError('Invalid data type received: {}'.format(args.dataset_type))
def parse_args(args):
    """ Parse the command line arguments of the evaluation script.

    The dataset type is a required sub-command ('coco', 'pascal' or 'csv'), stored as
    `dataset_type`; the positional `model` argument and the remaining options belong to the
    top-level parser.

    Args
        args: List of argument strings (without the program name).

    Returns
        The argparse namespace with the parsed values.
    """
    parser     = argparse.ArgumentParser(description='Evaluation script for a RetinaNet network.')
    subparsers = parser.add_subparsers(help='Arguments for specific dataset types.', dest='dataset_type')
    subparsers.required = True

    coco_parser = subparsers.add_parser('coco')
    coco_parser.add_argument('coco_path', help='Path to dataset directory (ie. /tmp/COCO).')

    pascal_parser = subparsers.add_parser('pascal')
    pascal_parser.add_argument('pascal_path', help='Path to dataset directory (ie. /tmp/VOCdevkit).')
    pascal_parser.add_argument('--image-extension', help='Declares the dataset images\' extension.', default='.jpg')

    csv_parser = subparsers.add_parser('csv')
    csv_parser.add_argument('annotations', help='Path to CSV file containing annotations for evaluation.')
    csv_parser.add_argument('classes', help='Path to a CSV file containing class label mapping.')

    parser.add_argument('model', help='Path to RetinaNet model.')
    parser.add_argument('--convert-model', help='Convert the model to an inference model (ie. the input is a training model).', action='store_true')
    parser.add_argument('--backbone', help='The backbone of the model.', default='resnet50')
    parser.add_argument('--gpu', help='Id of the GPU to use (as reported by nvidia-smi).')
    parser.add_argument('--score-threshold', help='Threshold on score to filter detections with (defaults to 0.05).', default=0.05, type=float)
    parser.add_argument('--iou-threshold', help='IoU Threshold to count for a positive detection (defaults to 0.5).', default=0.5, type=float)
    parser.add_argument('--max-detections', help='Max Detections per image (defaults to 100).', default=100, type=int)
    parser.add_argument('--save-path', help='Path for saving images with detections (doesn\'t work for COCO).')
    parser.add_argument('--image-min-side', help='Rescale the image so the smallest side is min_side.', type=int, default=800)
    parser.add_argument('--image-max-side', help='Rescale the image if the largest side is larger than max_side.', type=int, default=1333)
    # The help text used to be written as 'Don''t rescale the image.': two adjacent string
    # literals that concatenate to "Dont rescale the image." (apostrophe lost).
    parser.add_argument('--no-resize', help='Don\'t rescale the image.', action='store_true')
    parser.add_argument('--config', help='Path to a configuration parameters .ini file (only used with --convert-model).')
    parser.add_argument('--group-method', help='Determines how images are grouped together', type=str, default='ratio', choices=['none', 'random', 'ratio'])

    return parser.parse_args(args)
def main(args=None):
    """ Entry point of the evaluation script: load a model and evaluate it on a dataset.

    For the 'coco' dataset type the evaluation is delegated to evaluate_coco(). For the other
    types the per-class average precision, the inference time and two mAP figures are printed.

    Args
        args: List of argument strings; sys.argv[1:] is used when None.
    """
    # parse arguments
    args = parse_args(sys.argv[1:] if args is None else args)

    # make sure tensorflow is the minimum required version
    check_tf_version()

    # optionally choose specific GPU
    if args.gpu:
        setup_gpu(args.gpu)

    # make save path if it doesn't exist
    if args.save_path is not None and not os.path.exists(args.save_path):
        os.makedirs(args.save_path)

    # optionally load config parameters
    if args.config:
        args.config = read_config_file(args.config)

    # create the generator
    backbone = models.backbone(args.backbone)
    generator = create_generator(args, backbone.preprocess_image)

    # optionally load anchor parameters
    anchor_params = None
    pyramid_levels = None
    if args.config and 'anchor_parameters' in args.config:
        anchor_params = parse_anchor_parameters(args.config)
    if args.config and 'pyramid_levels' in args.config:
        pyramid_levels = parse_pyramid_levels(args.config)

    # load the model
    print('Loading model, this may take a second...')
    model = models.load_model(args.model, backbone_name=args.backbone)
    generator.compute_shapes = make_shapes_callback(model)

    # optionally convert the model
    if args.convert_model:
        model = models.convert_model(model, anchor_params=anchor_params, pyramid_levels=pyramid_levels)

    # start evaluation
    if args.dataset_type == 'coco':
        from ..utils.coco_eval import evaluate_coco
        evaluate_coco(generator, model, args.score_threshold)
        return

    average_precisions, inference_time = evaluate(
        generator,
        model,
        iou_threshold=args.iou_threshold,
        score_threshold=args.score_threshold,
        max_detections=args.max_detections,
        save_path=args.save_path
    )

    # print evaluation
    instance_counts = []
    class_precisions = []
    for label, (average_precision, num_annotations) in average_precisions.items():
        print('{:.0f} instances of class'.format(num_annotations),
              generator.label_to_name(label), 'with average precision: {:.4f}'.format(average_precision))
        instance_counts.append(num_annotations)
        class_precisions.append(average_precision)

    if sum(instance_counts) == 0:
        print('No test instances found.')
        return

    print('Inference time for {:.0f} images: {:.4f}'.format(generator.size(), inference_time))

    # precision of each class weighted by its number of instances
    weighted_total = sum([count * precision for count, precision in zip(instance_counts, class_precisions)])
    print('mAP using the weighted average of precisions among classes: {:.4f}'.format(weighted_total / sum(instance_counts)))

    # plain mean over the classes that have at least one instance
    populated_classes = sum(count > 0 for count in instance_counts)
    print('mAP: {:.4f}'.format(sum(class_precisions) / populated_classes))
def makedirs(path):
    """ Create the directory `path` (including parents), tolerating an existing directory.

    Raises
        OSError: if the directory could not be created and `path` is not an existing directory.
    """
    # Intended behavior: try to create the directory,
    # pass if the directory exists already, fails otherwise.
    # Meant for Python 2.7/3.n compatibility.
    try:
        os.makedirs(path)
    except OSError:
        if os.path.isdir(path):
            return
        raise
def model_with_weights(model, weights, skip_mismatch):
    """ Load weights into a model (by layer name) and return that same model.

    Args
        model         : The model to load weights for.
        weights       : The weights to load; nothing is loaded when this is None.
        skip_mismatch : If True, skips layers whose shape of weights doesn't match with the model.

    Returns
        The `model` object that was passed in.
    """
    if weights is None:
        return model

    model.load_weights(weights, by_name=True, skip_mismatch=skip_mismatch)
    return model
# optionally wrap in a parallel model if multi_gpu > 1: from keras.utils import multi_gpu_model with tf.device('/cpu:0'): model = model_with_weights(backbone_retinanet(num_classes, num_anchors=num_anchors, modifier=modifier, pyramid_levels=pyramid_levels), weights=weights, skip_mismatch=True) training_model = multi_gpu_model(model, gpus=multi_gpu) else: model = model_with_weights(backbone_retinanet(num_classes, num_anchors=num_anchors, modifier=modifier, pyramid_levels=pyramid_levels), weights=weights, skip_mismatch=True) training_model = model # make prediction model prediction_model = retinanet_bbox(model=model, anchor_params=anchor_params, pyramid_levels=pyramid_levels) # compile model training_model.compile( loss={ 'regression' : losses.smooth_l1(), 'classification': losses.focal() }, optimizer=keras.optimizers.Adam(lr=lr, clipnorm=optimizer_clipnorm) ) return model, training_model, prediction_model def create_callbacks(model, training_model, prediction_model, validation_generator, args): """ Creates the callbacks to use during training. Args model: The base model. training_model: The model that is used for training. prediction_model: The model that should be used for validation. validation_generator: The generator for creating validation data. args: parseargs args object. Returns: A list of callbacks used for training. 
""" callbacks = [] tensorboard_callback = None if args.tensorboard_dir: makedirs(args.tensorboard_dir) update_freq = args.tensorboard_freq if update_freq not in ['epoch', 'batch']: update_freq = int(update_freq) tensorboard_callback = keras.callbacks.TensorBoard( log_dir = args.tensorboard_dir, histogram_freq = 0, batch_size = args.batch_size, write_graph = True, write_grads = False, write_images = False, update_freq = update_freq, embeddings_freq = 0, embeddings_layer_names = None, embeddings_metadata = None ) if args.evaluation and validation_generator: if args.dataset_type == 'coco': from ..callbacks.coco import CocoEval # use prediction model for evaluation evaluation = CocoEval(validation_generator, tensorboard=tensorboard_callback) else: evaluation = Evaluate(validation_generator, tensorboard=tensorboard_callback, weighted_average=args.weighted_average) evaluation = RedirectModel(evaluation, prediction_model) callbacks.append(evaluation) # save the model if args.snapshots: # ensure directory created first; otherwise h5py will error after epoch. makedirs(args.snapshot_path) checkpoint = keras.callbacks.ModelCheckpoint( os.path.join( args.snapshot_path, '{backbone}_{dataset_type}_{{epoch:02d}}.h5'.format(backbone=args.backbone, dataset_type=args.dataset_type) ), verbose=1, # save_best_only=True, # monitor="mAP", # mode='max' ) checkpoint = RedirectModel(checkpoint, model) callbacks.append(checkpoint) callbacks.append(keras.callbacks.ReduceLROnPlateau( monitor = 'loss', factor = args.reduce_lr_factor, patience = args.reduce_lr_patience, verbose = 1, mode = 'auto', min_delta = 0.0001, cooldown = 0, min_lr = 0 )) if args.evaluation and validation_generator: callbacks.append(keras.callbacks.EarlyStopping( monitor = 'mAP', patience = 5, mode = 'max', min_delta = 0.01 )) if args.tensorboard_dir: callbacks.append(tensorboard_callback) return callbacks def create_generators(args, preprocess_image): """ Create generators for training and validation. 
Args args : parseargs object containing configuration for generators. preprocess_image : Function that preprocesses an image for the network. """ common_args = { 'batch_size' : args.batch_size, 'config' : args.config, 'image_min_side' : args.image_min_side, 'image_max_side' : args.image_max_side, 'no_resize' : args.no_resize, 'preprocess_image' : preprocess_image, 'group_method' : args.group_method } # create random transform generator for augmenting training data if args.random_transform: transform_generator = random_transform_generator( min_rotation=-0.1, max_rotation=0.1, min_translation=(-0.1, -0.1), max_translation=(0.1, 0.1), min_shear=-0.1, max_shear=0.1, min_scaling=(0.9, 0.9), max_scaling=(1.1, 1.1), flip_x_chance=0.5, flip_y_chance=0.5, ) visual_effect_generator = random_visual_effect_generator( contrast_range=(0.9, 1.1), brightness_range=(-.1, .1), hue_range=(-0.05, 0.05), saturation_range=(0.95, 1.05) ) else: transform_generator = random_transform_generator(flip_x_chance=0.5) visual_effect_generator = None if args.dataset_type == 'coco': # import here to prevent unnecessary dependency on cocoapi from ..preprocessing.coco import CocoGenerator train_generator = CocoGenerator( args.coco_path, 'train2017', transform_generator=transform_generator, visual_effect_generator=visual_effect_generator, **common_args ) validation_generator = CocoGenerator( args.coco_path, 'val2017', shuffle_groups=False, **common_args ) elif args.dataset_type == 'pascal': train_generator = PascalVocGenerator( args.pascal_path, 'train', image_extension=args.image_extension, transform_generator=transform_generator, visual_effect_generator=visual_effect_generator, **common_args ) validation_generator = PascalVocGenerator( args.pascal_path, 'val', image_extension=args.image_extension, shuffle_groups=False, **common_args ) elif args.dataset_type == 'csv': train_generator = CSVGenerator( args.annotations, args.classes, transform_generator=transform_generator, 
visual_effect_generator=visual_effect_generator, **common_args ) if args.val_annotations: validation_generator = CSVGenerator( args.val_annotations, args.classes, shuffle_groups=False, **common_args ) else: validation_generator = None elif args.dataset_type == 'oid': train_generator = OpenImagesGenerator( args.main_dir, subset='train', version=args.version, labels_filter=args.labels_filter, annotation_cache_dir=args.annotation_cache_dir, parent_label=args.parent_label, transform_generator=transform_generator, visual_effect_generator=visual_effect_generator, **common_args ) validation_generator = OpenImagesGenerator( args.main_dir, subset='validation', version=args.version, labels_filter=args.labels_filter, annotation_cache_dir=args.annotation_cache_dir, parent_label=args.parent_label, shuffle_groups=False, **common_args ) elif args.dataset_type == 'kitti': train_generator = KittiGenerator( args.kitti_path, subset='train', transform_generator=transform_generator, visual_effect_generator=visual_effect_generator, **common_args ) validation_generator = KittiGenerator( args.kitti_path, subset='val', shuffle_groups=False, **common_args ) else: raise ValueError('Invalid data type received: {}'.format(args.dataset_type)) return train_generator, validation_generator def check_args(parsed_args): """ Function to check for inherent contradictions within parsed arguments. For example, batch_size < num_gpus Intended to raise errors prior to backend initialisation. 
def parse_args(args):
    """ Parse the command line arguments of the training script.

    Args
        args : List of argument strings (typically sys.argv[1:]).

    Returns
        The parsed namespace, after it has been passed through check_args.

    Raises
        SystemExit : Raised by argparse on invalid arguments or when help is requested.
    """
    parser     = argparse.ArgumentParser(description='Simple training script for training a RetinaNet network.')
    subparsers = parser.add_subparsers(help='Arguments for specific dataset types.', dest='dataset_type')
    subparsers.required = True

    coco_parser = subparsers.add_parser('coco')
    coco_parser.add_argument('coco_path', help='Path to dataset directory (ie. /tmp/COCO).')

    pascal_parser = subparsers.add_parser('pascal')
    pascal_parser.add_argument('pascal_path', help='Path to dataset directory (ie. /tmp/VOCdevkit).')
    pascal_parser.add_argument('--image-extension', help='Declares the dataset images\' extension.', default='.jpg')

    kitti_parser = subparsers.add_parser('kitti')
    kitti_parser.add_argument('kitti_path', help='Path to dataset directory (ie. /tmp/kitti).')

    def csv_list(string):
        # argparse type converter: "a,b,c" -> ['a', 'b', 'c']
        return string.split(',')

    oid_parser = subparsers.add_parser('oid')
    oid_parser.add_argument('main_dir', help='Path to dataset directory.')
    oid_parser.add_argument('--version', help='The current dataset version is v4.', default='v4')
    oid_parser.add_argument('--labels-filter', help='A list of labels to filter.', type=csv_list, default=None)
    oid_parser.add_argument('--annotation-cache-dir', help='Path to store annotation cache.', default='.')
    oid_parser.add_argument('--parent-label', help='Use the hierarchy children of this label.', default=None)

    csv_parser = subparsers.add_parser('csv')
    csv_parser.add_argument('annotations', help='Path to CSV file containing annotations for training.')
    csv_parser.add_argument('classes', help='Path to a CSV file containing class label mapping.')
    csv_parser.add_argument('--val-annotations', help='Path to CSV file containing annotations for validation (optional).')

    # Weight initialisation options are mutually exclusive. --imagenet-weights and
    # --no-weights share the same destination, which defaults to True.
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--snapshot', help='Resume training from a snapshot.')
    group.add_argument('--imagenet-weights', help='Initialize the model with pretrained imagenet weights. This is the default behaviour.', action='store_const', const=True, default=True)
    group.add_argument('--weights', help='Initialize the model with weights from a file.')
    group.add_argument('--no-weights', help='Don\'t initialize the model with any weights.', dest='imagenet_weights', action='store_const', const=False)

    parser.add_argument('--backbone', help='Backbone model used by retinanet.', default='resnet50', type=str)
    parser.add_argument('--batch-size', help='Size of the batches.', default=1, type=int)
    parser.add_argument('--gpu', help='Id of the GPU to use (as reported by nvidia-smi).')
    parser.add_argument('--multi-gpu', help='Number of GPUs to use for parallel processing.', type=int, default=0)
    parser.add_argument('--multi-gpu-force', help='Extra flag needed to enable (experimental) multi-gpu support.', action='store_true')
    parser.add_argument('--initial-epoch', help='Epoch from which to begin the train, useful if resuming from snapshot.', type=int, default=0)
    parser.add_argument('--epochs', help='Number of epochs to train.', type=int, default=50)
    parser.add_argument('--steps', help='Number of steps per epoch.', type=int, default=10000)
    parser.add_argument('--lr', help='Learning rate.', type=float, default=1e-5)
    parser.add_argument('--optimizer-clipnorm', help='Clipnorm parameter for optimizer.', type=float, default=0.001)
    parser.add_argument('--snapshot-path', help='Path to store snapshots of models during training (defaults to \'./snapshots\')', default='./snapshots')
    parser.add_argument('--tensorboard-dir', help='Log directory for Tensorboard output', default='')  # default='./logs') => https://github.com/tensorflow/tensorflow/pull/34870
    parser.add_argument('--tensorboard-freq', help='Update frequency for Tensorboard output. Values \'epoch\', \'batch\' or int', default='epoch')
    parser.add_argument('--no-snapshots', help='Disable saving snapshots.', dest='snapshots', action='store_false')
    parser.add_argument('--no-evaluation', help='Disable per epoch evaluation.', dest='evaluation', action='store_false')
    parser.add_argument('--freeze-backbone', help='Freeze training of backbone layers.', action='store_true')
    parser.add_argument('--random-transform', help='Randomly transform image and annotations.', action='store_true')
    parser.add_argument('--image-min-side', help='Rescale the image so the smallest side is min_side.', type=int, default=800)
    parser.add_argument('--image-max-side', help='Rescale the image if the largest side is larger than max_side.', type=int, default=1333)
    # The original literal 'Don''t ...' was two adjacent strings, which rendered as "Dont".
    parser.add_argument('--no-resize', help='Don\'t rescale the image.', action='store_true')
    parser.add_argument('--config', help='Path to a configuration parameters .ini file.')
    parser.add_argument('--weighted-average', help='Compute the mAP using the weighted average of precisions among classes.', action='store_true')
    parser.add_argument('--compute-val-loss', help='Compute validation loss during training', dest='compute_val_loss', action='store_true')
    # The value is used as the patience of a reduce-on-plateau schedule, i.e. the
    # number of epochs without improvement that is tolerated.
    parser.add_argument('--reduce-lr-patience', help='Reduce learning rate after the monitored loss has not improved for reduce_lr_patience epochs', type=int, default=2)
    parser.add_argument('--reduce-lr-factor', help='When learning rate is reduced due to reduce_lr_patience, multiply by reduce_lr_factor', type=float, default=0.1)
    parser.add_argument('--group-method', help='Determines how images are grouped together', type=str, default='ratio', choices=['none', 'random', 'ratio'])

    # Fit generator arguments
    parser.add_argument('--multiprocessing', help='Use multiprocessing in fit_generator.', action='store_true')
    parser.add_argument('--workers', help='Number of generator workers.', type=int, default=1)
    parser.add_argument('--max-queue-size', help='Queue length for multiprocessing workers in fit_generator.', type=int, default=10)

    # Parse first, then validate: argparse errors / --help exit before any
    # cross-argument checks are attempted.
    parsed_args = parser.parse_args(args)
    return check_args(parsed_args)
class CocoEval(keras.callbacks.Callback):
    """ Callback that runs COCO evaluation at the end of every epoch. """

    def __init__(self, generator, tensorboard=None, threshold=0.05):
        """ CocoEval callback initializer.

        Args
            generator   : The generator used for creating validation data.
            tensorboard : If given, the results will be written to tensorboard.
            threshold   : The score threshold to use.
        """
        self.generator = generator
        self.threshold = threshold
        self.tensorboard = tensorboard

        super(CocoEval, self).__init__()

    def on_epoch_end(self, epoch, logs=None):
        """ Evaluate the model and publish the COCO statistics.

        Each statistic is stored in logs under its tag and, when a tensorboard
        callback was supplied, written as a scalar to its log directory.
        """
        logs = logs or {}

        # NOTE(review): tag strings are reproduced as they appear in this view of
        # the file; confirm their internal spacing against the original source.
        metric_tags = ['AP @[ IoU=0.50:0.95 | area= all | maxDets=100 ]',
                       'AP @[ IoU=0.50 | area= all | maxDets=100 ]',
                       'AP @[ IoU=0.75 | area= all | maxDets=100 ]',
                       'AP @[ IoU=0.50:0.95 | area= small | maxDets=100 ]',
                       'AP @[ IoU=0.50:0.95 | area=medium | maxDets=100 ]',
                       'AP @[ IoU=0.50:0.95 | area= large | maxDets=100 ]',
                       'AR @[ IoU=0.50:0.95 | area= all | maxDets= 1 ]',
                       'AR @[ IoU=0.50:0.95 | area= all | maxDets= 10 ]',
                       'AR @[ IoU=0.50:0.95 | area= all | maxDets=100 ]',
                       'AR @[ IoU=0.50:0.95 | area= small | maxDets=100 ]',
                       'AR @[ IoU=0.50:0.95 | area=medium | maxDets=100 ]',
                       'AR @[ IoU=0.50:0.95 | area= large | maxDets=100 ]']

        stats = evaluate_coco(self.generator, self.model, self.threshold)
        if stats is None:
            # nothing was evaluated, so there is nothing to report
            return

        for position, value in enumerate(stats):
            logs[metric_tags[position]] = value

        if not self.tensorboard:
            return

        import tensorflow as tf
        writer = tf.summary.create_file_writer(self.tensorboard.log_dir)
        with writer.as_default():
            for position, value in enumerate(stats):
                scalar_name = '{}. {}'.format(position + 1, metric_tags[position])
                tf.summary.scalar(scalar_name, value, step=epoch)
            writer.flush()
""" def __init__(self, callback, model): super(RedirectModel, self).__init__() self.callback = callback self.redirect_model = model def on_epoch_begin(self, epoch, logs=None): self.callback.on_epoch_begin(epoch, logs=logs) def on_epoch_end(self, epoch, logs=None): self.callback.on_epoch_end(epoch, logs=logs) def on_batch_begin(self, batch, logs=None): self.callback.on_batch_begin(batch, logs=logs) def on_batch_end(self, batch, logs=None): self.callback.on_batch_end(batch, logs=logs) def on_train_begin(self, logs=None): # overwrite the model with our custom model self.callback.set_model(self.redirect_model) self.callback.on_train_begin(logs=logs) def on_train_end(self, logs=None): self.callback.on_train_end(logs=logs) ================================================ FILE: imageai_tf_deprecated/Detection/keras_retinanet/callbacks/eval.py ================================================ """ Copyright 2017-2018 Fizyr (https://fizyr.com) Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. """ from tensorflow import keras from ..utils.eval import evaluate class Evaluate(keras.callbacks.Callback): """ Evaluation callback for arbitrary datasets. """ def __init__( self, generator, iou_threshold=0.5, score_threshold=0.05, max_detections=100, save_path=None, tensorboard=None, weighted_average=False, verbose=1 ): """ Evaluate a given dataset using a given model at the end of every epoch during training. # Arguments generator : The generator that represents the dataset to evaluate. 
iou_threshold : The threshold used to consider when a detection is positive or negative. score_threshold : The score confidence threshold to use for detections. max_detections : The maximum number of detections to use per image. save_path : The path to save images with visualized detections to. tensorboard : Instance of keras.callbacks.TensorBoard used to log the mAP value. weighted_average : Compute the mAP using the weighted average of precisions among classes. verbose : Set the verbosity level, by default this is set to 1. """ self.generator = generator self.iou_threshold = iou_threshold self.score_threshold = score_threshold self.max_detections = max_detections self.save_path = save_path self.tensorboard = tensorboard self.weighted_average = weighted_average self.verbose = verbose super(Evaluate, self).__init__() def on_epoch_end(self, epoch, logs=None): logs = logs or {} # run evaluation average_precisions, _ = evaluate( self.generator, self.model, iou_threshold=self.iou_threshold, score_threshold=self.score_threshold, max_detections=self.max_detections, save_path=self.save_path ) # compute per class average precision total_instances = [] precisions = [] for label, (average_precision, num_annotations) in average_precisions.items(): if self.verbose == 1: print('{:.0f} instances of class'.format(num_annotations), self.generator.label_to_name(label), 'with average precision: {:.4f}'.format(average_precision)) total_instances.append(num_annotations) precisions.append(average_precision) if self.weighted_average: self.mean_ap = sum([a * b for a, b in zip(total_instances, precisions)]) / sum(total_instances) else: self.mean_ap = sum(precisions) / sum(x > 0 for x in total_instances) if self.tensorboard: import tensorflow as tf writer = tf.summary.create_file_writer(self.tensorboard.log_dir) with writer.as_default(): tf.summary.scalar("mAP", self.mean_ap, step=epoch) if self.verbose == 1: for label, (average_precision, num_annotations) in average_precisions.items(): 
tf.summary.scalar("AP_" + self.generator.label_to_name(label), average_precision, step=epoch) writer.flush() logs['mAP'] = self.mean_ap if self.verbose == 1: print('mAP: {:.4f}'.format(self.mean_ap)) ================================================ FILE: imageai_tf_deprecated/Detection/keras_retinanet/initializers.py ================================================ """ Copyright 2017-2018 Fizyr (https://fizyr.com) Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. """ from tensorflow import keras import math class PriorProbability(keras.initializers.Initializer): """ Apply a prior probability to the weights. 
""" def __init__(self, probability=0.01): self.probability = probability def get_config(self): return { 'probability': self.probability } def __call__(self, shape, dtype=None): # set bias to -log((1 - p)/p) for foreground result = keras.backend.ones(shape, dtype=dtype) * -math.log((1 - self.probability) / self.probability) return result ================================================ FILE: imageai_tf_deprecated/Detection/keras_retinanet/layers/__init__.py ================================================ from ._misc import RegressBoxes, UpsampleLike, Anchors, ClipBoxes # noqa: F401 from .filter_detections import FilterDetections # noqa: F401 ================================================ FILE: imageai_tf_deprecated/Detection/keras_retinanet/layers/_misc.py ================================================ """ Copyright 2017-2018 Fizyr (https://fizyr.com) Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. """ import tensorflow from tensorflow import keras from .. import backend from ..utils import anchors as utils_anchors import numpy as np class Anchors(keras.layers.Layer): """ Keras layer for generating achors for a given shape. """ def __init__(self, size, stride, ratios=None, scales=None, *args, **kwargs): """ Initializer for an Anchors layer. Args size: The base size of the anchors to generate. stride: The stride of the anchors to generate. ratios: The ratios of the anchors to generate (defaults to AnchorParameters.default.ratios). 
scales: The scales of the anchors to generate (defaults to AnchorParameters.default.scales). """ self.size = size self.stride = stride self.ratios = ratios self.scales = scales if ratios is None: self.ratios = utils_anchors.AnchorParameters.default.ratios elif isinstance(ratios, list): self.ratios = np.array(ratios) if scales is None: self.scales = utils_anchors.AnchorParameters.default.scales elif isinstance(scales, list): self.scales = np.array(scales) self.num_anchors = len(self.ratios) * len(self.scales) self.anchors = utils_anchors.generate_anchors( base_size=self.size, ratios=self.ratios, scales=self.scales, ).astype(np.float32) super(Anchors, self).__init__(*args, **kwargs) def call(self, inputs, **kwargs): features = inputs features_shape = keras.backend.shape(features) # generate proposals from bbox deltas and shifted anchors if keras.backend.image_data_format() == 'channels_first': anchors = backend.shift(features_shape[2:4], self.stride, self.anchors) else: anchors = backend.shift(features_shape[1:3], self.stride, self.anchors) anchors = keras.backend.tile(keras.backend.expand_dims(anchors, axis=0), (features_shape[0], 1, 1)) return anchors def compute_output_shape(self, input_shape): if None not in input_shape[1:]: if keras.backend.image_data_format() == 'channels_first': total = np.prod(input_shape[2:4]) * self.num_anchors else: total = np.prod(input_shape[1:3]) * self.num_anchors return (input_shape[0], total, 4) else: return (input_shape[0], None, 4) def get_config(self): config = super(Anchors, self).get_config() config.update({ 'size' : self.size, 'stride' : self.stride, 'ratios' : self.ratios.tolist(), 'scales' : self.scales.tolist(), }) return config class UpsampleLike(keras.layers.Layer): """ Keras layer for upsampling a Tensor to be the same shape as another Tensor. 
""" def call(self, inputs, **kwargs): source, target = inputs target_shape = keras.backend.shape(target) if keras.backend.image_data_format() == 'channels_first': source = tensorflow.transpose(source, (0, 2, 3, 1)) output = backend.resize_images(source, (target_shape[2], target_shape[3]), method='nearest') output = tensorflow.transpose(output, (0, 3, 1, 2)) return output else: return backend.resize_images(source, (target_shape[1], target_shape[2]), method='nearest') def compute_output_shape(self, input_shape): if keras.backend.image_data_format() == 'channels_first': return (input_shape[0][0], input_shape[0][1]) + input_shape[1][2:4] else: return (input_shape[0][0],) + input_shape[1][1:3] + (input_shape[0][-1],) class RegressBoxes(keras.layers.Layer): """ Keras layer for applying regression values to boxes. """ def __init__(self, mean=None, std=None, *args, **kwargs): """ Initializer for the RegressBoxes layer. Args mean: The mean value of the regression values which was used for normalization. std: The standard value of the regression values which was used for normalization. """ if mean is None: mean = np.array([0, 0, 0, 0]) if std is None: std = np.array([0.2, 0.2, 0.2, 0.2]) if isinstance(mean, (list, tuple)): mean = np.array(mean) elif not isinstance(mean, np.ndarray): raise ValueError('Expected mean to be a np.ndarray, list or tuple. Received: {}'.format(type(mean))) if isinstance(std, (list, tuple)): std = np.array(std) elif not isinstance(std, np.ndarray): raise ValueError('Expected std to be a np.ndarray, list or tuple. 
Received: {}'.format(type(std))) self.mean = mean self.std = std super(RegressBoxes, self).__init__(*args, **kwargs) def call(self, inputs, **kwargs): anchors, regression = inputs return backend.bbox_transform_inv(anchors, regression, mean=self.mean, std=self.std) def compute_output_shape(self, input_shape): return input_shape[0] def get_config(self): config = super(RegressBoxes, self).get_config() config.update({ 'mean': self.mean.tolist(), 'std' : self.std.tolist(), }) return config class ClipBoxes(keras.layers.Layer): """ Keras layer to clip box values to lie inside a given shape. """ def call(self, inputs, **kwargs): image, boxes = inputs shape = keras.backend.cast(keras.backend.shape(image), keras.backend.floatx()) if keras.backend.image_data_format() == 'channels_first': _, _, height, width = tensorflow.unstack(shape, axis=0) else: _, height, width, _ = tensorflow.unstack(shape, axis=0) x1, y1, x2, y2 = tensorflow.unstack(boxes, axis=-1) x1 = tensorflow.clip_by_value(x1, 0, width - 1) y1 = tensorflow.clip_by_value(y1, 0, height - 1) x2 = tensorflow.clip_by_value(x2, 0, width - 1) y2 = tensorflow.clip_by_value(y2, 0, height - 1) return keras.backend.stack([x1, y1, x2, y2], axis=2) def compute_output_shape(self, input_shape): return input_shape[1] ================================================ FILE: imageai_tf_deprecated/Detection/keras_retinanet/layers/filter_detections.py ================================================ """ Copyright 2017-2018 Fizyr (https://fizyr.com) Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
def filter_detections(
    boxes,
    classification,
    other=None,
    class_specific_filter=True,
    nms=True,
    score_threshold=0.05,
    max_detections=300,
    nms_threshold=0.5
):
    """ Filter detections using the boxes and classification values.

    Args
        boxes                 : Tensor of shape (num_boxes, 4) containing the boxes in (x1, y1, x2, y2) format.
        classification        : Tensor of shape (num_boxes, num_classes) containing the classification scores.
        other                 : List of tensors of shape (num_boxes, ...) to filter along with the boxes and classification scores.
                                Defaults to no extra tensors.
        class_specific_filter : Whether to perform filtering per class, or take the best scoring class and filter those.
        nms                   : Flag to enable/disable non maximum suppression.
        score_threshold       : Threshold used to prefilter the boxes with.
        max_detections        : Maximum number of detections to keep.
        nms_threshold         : Threshold for the IoU value to determine when a box should be suppressed.

    Returns
        A list of [boxes, scores, labels, other[0], other[1], ...].
        boxes is shaped (max_detections, 4) and contains the (x1, y1, x2, y2) of the non-suppressed boxes.
        scores is shaped (max_detections,) and contains the scores of the predicted class.
        labels is shaped (max_detections,) and contains the predicted label.
        other[i] is shaped (max_detections, ...) and contains the filtered other[i] data.
        In case there are less than max_detections detections, the tensors are padded with -1's.
    """
    # a list literal as default would be one object shared by every call
    if other is None:
        other = []

    def _filter_detections(scores, labels):
        """ Return (box index, label) pairs that pass the score threshold and, if enabled, NMS. """
        # threshold based on score
        indices = tensorflow.where(keras.backend.greater(scores, score_threshold))

        if nms:
            filtered_boxes = tensorflow.gather_nd(boxes, indices)
            filtered_scores = keras.backend.gather(scores, indices)[:, 0]

            # perform NMS
            nms_indices = tensorflow.image.non_max_suppression(
                filtered_boxes,
                filtered_scores,
                max_output_size=max_detections,
                iou_threshold=nms_threshold,
            )

            # filter indices based on NMS
            indices = keras.backend.gather(indices, nms_indices)

        # pair every surviving box index with its label
        labels = tensorflow.gather_nd(labels, indices)
        indices = keras.backend.stack([indices[:, 0], labels], axis=1)

        return indices

    if class_specific_filter:
        all_indices = []
        # perform per class filtering
        for c in range(int(classification.shape[1])):
            scores = classification[:, c]
            labels = c * tensorflow.ones((keras.backend.shape(scores)[0],), dtype='int64')
            all_indices.append(_filter_detections(scores, labels))

        # concatenate indices to single tensor
        indices = keras.backend.concatenate(all_indices, axis=0)
    else:
        scores = keras.backend.max(classification, axis=1)
        labels = keras.backend.argmax(classification, axis=1)
        indices = _filter_detections(scores, labels)

    # select top k
    scores = tensorflow.gather_nd(classification, indices)
    labels = indices[:, 1]
    scores, top_indices = tensorflow.nn.top_k(
        scores,
        k=keras.backend.minimum(max_detections, keras.backend.shape(scores)[0]),
    )

    # filter input using the final set of indices
    indices = keras.backend.gather(indices[:, 0], top_indices)
    boxes = keras.backend.gather(boxes, indices)
    labels = keras.backend.gather(labels, top_indices)
    other_ = [keras.backend.gather(o, indices) for o in other]

    # pad the outputs with -1 up to max_detections entries
    pad_size = keras.backend.maximum(0, max_detections - keras.backend.shape(scores)[0])
    boxes = tensorflow.pad(boxes, [[0, pad_size], [0, 0]], constant_values=-1)
    scores = tensorflow.pad(scores, [[0, pad_size]], constant_values=-1)
    labels = tensorflow.pad(labels, [[0, pad_size]], constant_values=-1)
    labels = keras.backend.cast(labels, 'int32')
    other_ = [
        tensorflow.pad(o, [[0, pad_size]] + [[0, 0] for _ in range(1, len(o.shape))], constant_values=-1)
        for o in other_
    ]

    # set shapes, since we know what they are
    boxes.set_shape([max_detections, 4])
    scores.set_shape([max_detections])
    labels.set_shape([max_detections])
    for o, s in zip(other_, [list(keras.backend.int_shape(o)) for o in other]):
        o.set_shape([max_detections] + s[1:])

    return [boxes, scores, labels] + other_
""" boxes = inputs[0] classification = inputs[1] other = inputs[2:] # wrap nms with our parameters def _filter_detections(args): boxes = args[0] classification = args[1] other = args[2] return filter_detections( boxes, classification, other, nms = self.nms, class_specific_filter = self.class_specific_filter, score_threshold = self.score_threshold, max_detections = self.max_detections, nms_threshold = self.nms_threshold, ) # call filter_detections on each batch dtypes = [keras.backend.floatx(), keras.backend.floatx(), 'int32'] + [o.dtype for o in other] shapes = [(self.max_detections, 4), (self.max_detections,), (self.max_detections,)] shapes.extend([(self.max_detections,) + o.shape[2:] for o in other]) outputs = backend.map_fn( _filter_detections, elems=[boxes, classification, other], dtype=dtypes, shapes=shapes, parallel_iterations=self.parallel_iterations, ) return outputs def compute_output_shape(self, input_shape): """ Computes the output shapes given the input shapes. Args input_shape : List of input shapes [boxes, classification, other[0], other[1], ...]. Returns List of tuples representing the output shapes: [filtered_boxes.shape, filtered_scores.shape, filtered_labels.shape, filtered_other[0].shape, filtered_other[1].shape, ...] """ return [ (input_shape[0][0], self.max_detections, 4), (input_shape[1][0], self.max_detections), (input_shape[1][0], self.max_detections), ] + [ tuple([input_shape[i][0], self.max_detections] + list(input_shape[i][2:])) for i in range(2, len(input_shape)) ] def compute_mask(self, inputs, mask=None): """ This is required in Keras when there is more than 1 output. """ return (len(inputs) + 1) * [None] def get_config(self): """ Gets the configuration of this layer. Returns Dictionary containing the parameters of this layer. 
""" config = super(FilterDetections, self).get_config() config.update({ 'nms' : self.nms, 'class_specific_filter' : self.class_specific_filter, 'nms_threshold' : self.nms_threshold, 'score_threshold' : self.score_threshold, 'max_detections' : self.max_detections, 'parallel_iterations' : self.parallel_iterations, }) return config ================================================ FILE: imageai_tf_deprecated/Detection/keras_retinanet/losses.py ================================================ """ Copyright 2017-2018 Fizyr (https://fizyr.com) Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. """ import tensorflow from tensorflow import keras def focal(alpha=0.25, gamma=2.0, cutoff=0.5): """ Create a functor for computing the focal loss. Args alpha: Scale the focal weight with alpha. gamma: Take the power of the focal weight with gamma. cutoff: Positive prediction cutoff for soft targets Returns A functor that computes the focal loss using the alpha and gamma. """ def _focal(y_true, y_pred): """ Compute the focal loss given the target tensor and the predicted tensor. As defined in https://arxiv.org/abs/1708.02002 Args y_true: Tensor of target data from the generator with shape (B, N, num_classes). y_pred: Tensor of predicted data from the network with shape (B, N, num_classes). Returns The focal loss of y_pred w.r.t. y_true. 
""" labels = y_true[:, :, :-1] anchor_state = y_true[:, :, -1] # -1 for ignore, 0 for background, 1 for object classification = y_pred # filter out "ignore" anchors indices = tensorflow.where(keras.backend.not_equal(anchor_state, -1)) labels = tensorflow.gather_nd(labels, indices) classification = tensorflow.gather_nd(classification, indices) # compute the focal loss alpha_factor = keras.backend.ones_like(labels) * alpha alpha_factor = tensorflow.where(keras.backend.greater(labels, cutoff), alpha_factor, 1 - alpha_factor) focal_weight = tensorflow.where(keras.backend.greater(labels, cutoff), 1 - classification, classification) focal_weight = alpha_factor * focal_weight ** gamma cls_loss = focal_weight * keras.backend.binary_crossentropy(labels, classification) # compute the normalizer: the number of positive anchors normalizer = tensorflow.where(keras.backend.equal(anchor_state, 1)) normalizer = keras.backend.cast(keras.backend.shape(normalizer)[0], keras.backend.floatx()) normalizer = keras.backend.maximum(keras.backend.cast_to_floatx(1.0), normalizer) return keras.backend.sum(cls_loss) / normalizer return _focal def smooth_l1(sigma=3.0): """ Create a smooth L1 loss functor. Args sigma: This argument defines the point where the loss changes from L2 to L1. Returns A functor for computing the smooth L1 loss given target data and predicted data. """ sigma_squared = sigma ** 2 def _smooth_l1(y_true, y_pred): """ Compute the smooth L1 loss of y_pred w.r.t. y_true. Args y_true: Tensor from the generator of shape (B, N, 5). The last value for each box is the state of the anchor (ignore, negative, positive). y_pred: Tensor from the network of shape (B, N, 4). Returns The smooth L1 loss of y_pred w.r.t. y_true. 
""" # separate target and state regression = y_pred regression_target = y_true[:, :, :-1] anchor_state = y_true[:, :, -1] # filter out "ignore" anchors indices = tensorflow.where(keras.backend.equal(anchor_state, 1)) regression = tensorflow.gather_nd(regression, indices) regression_target = tensorflow.gather_nd(regression_target, indices) # compute smooth L1 loss # f(x) = 0.5 * (sigma * x)^2 if |x| < 1 / sigma / sigma # |x| - 0.5 / sigma / sigma otherwise regression_diff = regression - regression_target regression_diff = keras.backend.abs(regression_diff) regression_loss = tensorflow.where( keras.backend.less(regression_diff, 1.0 / sigma_squared), 0.5 * sigma_squared * keras.backend.pow(regression_diff, 2), regression_diff - 0.5 / sigma_squared ) # compute the normalizer: the number of positive anchors normalizer = keras.backend.maximum(1, keras.backend.shape(indices)[0]) normalizer = keras.backend.cast(normalizer, dtype=keras.backend.floatx()) return keras.backend.sum(regression_loss) / normalizer return _smooth_l1 ================================================ FILE: imageai_tf_deprecated/Detection/keras_retinanet/models/__init__.py ================================================ from __future__ import print_function import sys class Backbone(object): """ This class stores additional information on backbones. """ def __init__(self, backbone): # a dictionary mapping custom layer names to the correct classes from .. import layers from .. import losses from .. import initializers self.custom_objects = { 'UpsampleLike' : layers.UpsampleLike, 'PriorProbability' : initializers.PriorProbability, 'RegressBoxes' : layers.RegressBoxes, 'FilterDetections' : layers.FilterDetections, 'Anchors' : layers.Anchors, 'ClipBoxes' : layers.ClipBoxes, '_smooth_l1' : losses.smooth_l1(), '_focal' : losses.focal(), } self.backbone = backbone self.validate() def retinanet(self, *args, **kwargs): """ Returns a retinanet model using the correct backbone. 
""" raise NotImplementedError('retinanet method not implemented.') def download_imagenet(self): """ Downloads ImageNet weights and returns path to weights file. """ raise NotImplementedError('download_imagenet method not implemented.') def validate(self): """ Checks whether the backbone string is correct. """ raise NotImplementedError('validate method not implemented.') def preprocess_image(self, inputs): """ Takes as input an image and prepares it for being passed through the network. Having this function in Backbone allows other backbones to define a specific preprocessing step. """ raise NotImplementedError('preprocess_image method not implemented.') def backbone(backbone_name): """ Returns a backbone object for the given backbone. """ if 'densenet' in backbone_name: from .densenet import DenseNetBackbone as b elif 'seresnext' in backbone_name or 'seresnet' in backbone_name or 'senet' in backbone_name: from .senet import SeBackbone as b elif 'resnet' in backbone_name: from .resnet import ResNetBackbone as b elif 'mobilenet' in backbone_name: from .mobilenet import MobileNetBackbone as b elif 'vgg' in backbone_name: from .vgg import VGGBackbone as b elif 'EfficientNet' in backbone_name: from .effnet import EfficientNetBackbone as b else: raise NotImplementedError('Backbone class for \'{}\' not implemented.'.format(backbone)) return b(backbone_name) def load_model(filepath, backbone_name='resnet50'): """ Loads a retinanet model using the correct custom objects. Args filepath: one of the following: - string, path to the saved model, or - h5py.File object from which to load the model backbone_name : Backbone with which the model was trained. Returns A keras.models.Model object. Raises ImportError: if h5py is not available. ValueError: In case of an invalid savefile. 
""" from tensorflow import keras return keras.models.load_model(filepath, custom_objects=backbone(backbone_name).custom_objects) def convert_model(model, nms=True, class_specific_filter=True, anchor_params=None, **kwargs): """ Converts a training model to an inference model. Args model : A retinanet training model. nms : Boolean, whether to add NMS filtering to the converted model. class_specific_filter : Whether to use class specific filtering or filter for the best scoring class only. anchor_params : Anchor parameters object. If omitted, default values are used. **kwargs : Inference and minimal retinanet model settings. Returns A keras.models.Model object. Raises ImportError: if h5py is not available. ValueError: In case of an invalid savefile. """ from .retinanet import retinanet_bbox return retinanet_bbox(model=model, nms=nms, class_specific_filter=class_specific_filter, anchor_params=anchor_params, **kwargs) def assert_training_model(model): """ Assert that the model is a training model. """ assert(all(output in model.output_names for output in ['regression', 'classification'])), \ "Input is not a training model (no 'regression' and 'classification' outputs were found, outputs are: {}).".format(model.output_names) def check_training_model(model): """ Check that model is a training model and exit otherwise. """ try: assert_training_model(model) except AssertionError as e: print(e, file=sys.stderr) sys.exit(1) ================================================ FILE: imageai_tf_deprecated/Detection/keras_retinanet/models/densenet.py ================================================ """ Copyright 2018 vidosits (https://github.com/vidosits/) Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
def download_imagenet(self):
    """ Download pre-trained weights for the specified backbone name.

    The file is named {backbone}_weights_tf_dim_ordering_tf_kernels_notop.h5, where backbone is
    'densenet' plus the number of layers (e.g. densenet121). For more info check the keras densenet script:
    https://github.com/keras-team/keras/blob/master/keras/applications/densenet.py

    Returns
        Whatever keras.utils.get_file returns for the weights file.

    Raises
        ValueError: if the image data format is 'channels_first'.
    """
    # there is nothing to download for channels_first
    if keras.backend.image_data_format() == 'channels_first':
        raise ValueError('Weights for "channels_first" format are not available.')

    weights_name = '{}_weights_tf_dim_ordering_tf_kernels_notop.h5'.format(self.backbone)
    release_url = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.8/'

    return keras.utils.get_file(weights_name, release_url + weights_name, cache_subdir='models')
""" backbone = self.backbone.split('_')[0] if backbone not in allowed_backbones: raise ValueError('Backbone (\'{}\') not in allowed backbones ({}).'.format(backbone, allowed_backbones.keys())) def preprocess_image(self, inputs): """ Takes as input an image and prepares it for being passed through the network. """ return preprocess_image(inputs, mode='tf') def densenet_retinanet(num_classes, backbone='densenet121', inputs=None, modifier=None, **kwargs): """ Constructs a retinanet model using a densenet backbone. Args num_classes: Number of classes to predict. backbone: Which backbone to use (one of ('densenet121', 'densenet169', 'densenet201')). inputs: The inputs to the network (defaults to a Tensor of shape (None, None, 3)). modifier: A function handler which can modify the backbone before using it in retinanet (this can be used to freeze backbone layers for example). Returns RetinaNet model with a DenseNet backbone. """ # choose default input if inputs is None: inputs = keras.layers.Input((None, None, 3)) blocks, creator = allowed_backbones[backbone] model = creator(input_tensor=inputs, include_top=False, pooling=None, weights=None) # get last conv layer from the end of each dense block layer_outputs = [model.get_layer(name='conv{}_block{}_concat'.format(idx + 2, block_num)).output for idx, block_num in enumerate(blocks)] # create the densenet backbone # layer_outputs contains 4 layers model = keras.models.Model(inputs=inputs, outputs=layer_outputs, name=model.name) # invoke modifier if given if modifier: model = modifier(model) # create the full model backbone_layers = { 'C2': model.outputs[0], 'C3': model.outputs[1], 'C4': model.outputs[2], 'C5': model.outputs[3] } model = retinanet.retinanet(inputs=inputs, num_classes=num_classes, backbone_layers=backbone_layers, **kwargs) return model ================================================ FILE: imageai_tf_deprecated/Detection/keras_retinanet/models/effnet.py ================================================ """ 
def download_imagenet(self):
    """ Downloads ImageNet weights and returns path to weights file.

    Returns
        The value returned by keras.utils.get_file for the weights file.
    """
    from efficientnet.weights import IMAGENET_WEIGHTS_PATH
    from efficientnet.weights import IMAGENET_WEIGHTS_HASHES

    # validate() only checks the part before the first '_', so take the variant digit from
    # that same part; the last character of the full name may belong to a suffix
    variant = self.backbone.split('_')[0][-1]

    model_name = 'efficientnet-b' + variant
    file_name = model_name + '_weights_tf_dim_ordering_tf_kernels_autoaugment_notop.h5'
    # entry [1] of the hash record is used, as in the original (presumably the no-top file - confirm)
    file_hash = IMAGENET_WEIGHTS_HASHES[model_name][1]

    return keras.utils.get_file(
        file_name,
        IMAGENET_WEIGHTS_PATH + file_name,
        cache_subdir='models',
        file_hash=file_hash,
    )
""" allowed_backbones = ['EfficientNetB0', 'EfficientNetB1', 'EfficientNetB2', 'EfficientNetB3', 'EfficientNetB4', 'EfficientNetB5', 'EfficientNetB6', 'EfficientNetB7'] backbone = self.backbone.split('_')[0] if backbone not in allowed_backbones: raise ValueError('Backbone (\'{}\') not in allowed backbones ({}).'.format(backbone, allowed_backbones)) def preprocess_image(self, inputs): """ Takes as input an image and prepares it for being passed through the network. """ return efn.preprocess_input(inputs) def effnet_retinanet(num_classes, backbone='EfficientNetB0', inputs=None, modifier=None, **kwargs): """ Constructs a retinanet model using a resnet backbone. Args num_classes: Number of classes to predict. backbone: Which backbone to use (one of ('resnet50', 'resnet101', 'resnet152')). inputs: The inputs to the network (defaults to a Tensor of shape (None, None, 3)). modifier: A function handler which can modify the backbone before using it in retinanet (this can be used to freeze backbone layers for example). Returns RetinaNet model with a ResNet backbone. 
""" # choose default input if inputs is None: if keras.backend.image_data_format() == 'channels_first': inputs = keras.layers.Input(shape=(3, None, None)) else: # inputs = keras.layers.Input(shape=(224, 224, 3)) inputs = keras.layers.Input(shape=(None, None, 3)) # get last conv layer from the end of each block [28x28, 14x14, 7x7] if backbone == 'EfficientNetB0': model = efn.EfficientNetB0(input_tensor=inputs, include_top=False, weights=None) elif backbone == 'EfficientNetB1': model = efn.EfficientNetB1(input_tensor=inputs, include_top=False, weights=None) elif backbone == 'EfficientNetB2': model = efn.EfficientNetB2(input_tensor=inputs, include_top=False, weights=None) elif backbone == 'EfficientNetB3': model = efn.EfficientNetB3(input_tensor=inputs, include_top=False, weights=None) elif backbone == 'EfficientNetB4': model = efn.EfficientNetB4(input_tensor=inputs, include_top=False, weights=None) elif backbone == 'EfficientNetB5': model = efn.EfficientNetB5(input_tensor=inputs, include_top=False, weights=None) elif backbone == 'EfficientNetB6': model = efn.EfficientNetB6(input_tensor=inputs, include_top=False, weights=None) elif backbone == 'EfficientNetB7': model = efn.EfficientNetB7(input_tensor=inputs, include_top=False, weights=None) else: raise ValueError('Backbone (\'{}\') is invalid.'.format(backbone)) layer_outputs = ['block4a_expand_activation', 'block6a_expand_activation', 'top_activation'] layer_outputs = [ model.get_layer(name=layer_outputs[0]).output, # 28x28 model.get_layer(name=layer_outputs[1]).output, # 14x14 model.get_layer(name=layer_outputs[2]).output, # 7x7 ] # create the densenet backbone model = keras.models.Model(inputs=inputs, outputs=layer_outputs, name=model.name) # invoke modifier if given if modifier: model = modifier(model) # C2 not provided backbone_layers = { 'C3': model.outputs[0], 'C4': model.outputs[1], 'C5': model.outputs[2] } # create the full model return retinanet.retinanet(inputs=inputs, num_classes=num_classes, 
backbone_layers=backbone_layers, **kwargs) def EfficientNetB0_retinanet(num_classes, inputs=None, **kwargs): return effnet_retinanet(num_classes=num_classes, backbone='EfficientNetB0', inputs=inputs, **kwargs) def EfficientNetB1_retinanet(num_classes, inputs=None, **kwargs): return effnet_retinanet(num_classes=num_classes, backbone='EfficientNetB1', inputs=inputs, **kwargs) def EfficientNetB2_retinanet(num_classes, inputs=None, **kwargs): return effnet_retinanet(num_classes=num_classes, backbone='EfficientNetB2', inputs=inputs, **kwargs) def EfficientNetB3_retinanet(num_classes, inputs=None, **kwargs): return effnet_retinanet(num_classes=num_classes, backbone='EfficientNetB3', inputs=inputs, **kwargs) def EfficientNetB4_retinanet(num_classes, inputs=None, **kwargs): return effnet_retinanet(num_classes=num_classes, backbone='EfficientNetB4', inputs=inputs, **kwargs) def EfficientNetB5_retinanet(num_classes, inputs=None, **kwargs): return effnet_retinanet(num_classes=num_classes, backbone='EfficientNetB5', inputs=inputs, **kwargs) def EfficientNetB6_retinanet(num_classes, inputs=None, **kwargs): return effnet_retinanet(num_classes=num_classes, backbone='EfficientNetB6', inputs=inputs, **kwargs) def EfficientNetB7_retinanet(num_classes, inputs=None, **kwargs): return effnet_retinanet(num_classes=num_classes, backbone='EfficientNetB7', inputs=inputs, **kwargs) ================================================ FILE: imageai_tf_deprecated/Detection/keras_retinanet/models/mobilenet.py ================================================ """ Copyright 2017-2018 lvaleriu (https://github.com/lvaleriu/) Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
def download_imagenet(self):
    """ Download pre-trained weights for the specified backbone name.

    The backbone name has the format mobilenet{rows}_{alpha}, where rows is the imagenet shape
    dimension and alpha controls the width of the network.
    For more info check the explanation from the keras mobilenet script itself.

    Returns
        The value returned by keras.utils.get_file for the weights file.

    Raises
        ValueError: if the image data format is 'channels_first'.
    """
    name_parts = self.backbone.split('_')
    alpha = float(name_parts[1])
    rows = int(name_parts[0].replace('mobilenet', ''))

    # the message must name the format that was rejected (it used to say "channels_last")
    if keras.backend.image_data_format() == 'channels_first':
        raise ValueError('Weights for "channels_first" format '
                         'are not available.')

    # any alpha other than 1.0, 0.75 and 0.50 falls back to the '2_5' file, as before
    if alpha == 1.0:
        alpha_text = '1_0'
    elif alpha == 0.75:
        alpha_text = '7_5'
    elif alpha == 0.50:
        alpha_text = '5_0'
    else:
        alpha_text = '2_5'

    model_name = 'mobilenet_{}_{}_tf_no_top.h5'.format(alpha_text, rows)
    weights_url = 'https://github.com/fchollet/deep-learning-models/releases/download/v0.6/' + model_name

    return keras.utils.get_file(model_name, weights_url, cache_subdir='models')
""" backbone = self.backbone.split('_')[0] if backbone not in MobileNetBackbone.allowed_backbones: raise ValueError('Backbone (\'{}\') not in allowed backbones ({}).'.format(backbone, MobileNetBackbone.allowed_backbones)) def preprocess_image(self, inputs): """ Takes as input an image and prepares it for being passed through the network. """ return preprocess_image(inputs, mode='tf') def mobilenet_retinanet(num_classes, backbone='mobilenet224_1.0', inputs=None, modifier=None, **kwargs): """ Constructs a retinanet model using a mobilenet backbone. Args num_classes: Number of classes to predict. backbone: Which backbone to use (one of ('mobilenet128', 'mobilenet160', 'mobilenet192', 'mobilenet224')). inputs: The inputs to the network (defaults to a Tensor of shape (None, None, 3)). modifier: A function handler which can modify the backbone before using it in retinanet (this can be used to freeze backbone layers for example). Returns RetinaNet model with a MobileNet backbone. """ alpha = float(backbone.split('_')[1]) # choose default input if inputs is None: inputs = keras.layers.Input((None, None, 3)) backbone = keras.applications.mobilenet.MobileNet(input_tensor=inputs, alpha=alpha, include_top=False, pooling=None, weights=None) # create the full model layer_names = ['conv_pw_5_relu', 'conv_pw_11_relu', 'conv_pw_13_relu'] layer_outputs = [backbone.get_layer(name).output for name in layer_names] backbone = keras.models.Model(inputs=inputs, outputs=layer_outputs, name=backbone.name) # invoke modifier if given if modifier: backbone = modifier(backbone) # C2 not provided backbone_layers = { 'C3': backbone.outputs[0], 'C4': backbone.outputs[1], 'C5': backbone.outputs[2] } return retinanet.retinanet(inputs=inputs, num_classes=num_classes, backbone_layers=backbone_layers, **kwargs) ================================================ FILE: imageai_tf_deprecated/Detection/keras_retinanet/models/resnet.py ================================================ """ Copyright 2017-2018 
def download_imagenet(self):
    """ Downloads ImageNet weights and returns path to weights file.

    Returns
        The value returned by keras.utils.get_file for the weights file.

    Raises
        ValueError: if the backbone depth has no published weights (only 50, 101 and 152 do).
    """
    resnet_filename = 'ResNet-{}-model.keras.h5'
    resnet_resource = 'https://github.com/fizyr/keras-models/releases/download/v0.0.1/{}'.format(resnet_filename)

    # md5 checksum of the weights file for each supported depth
    checksums = {
        50: '3e9f4e4f77bbe2c9bec13b53ee1c2319',
        101: '05dc86924389e5b401a9ea0348a3213c',
        152: '6ee11ef2b135592f8031058820bb9e71',
    }

    # validate() only inspects the part before the first '_', so parse the depth from that part
    # too; int() would otherwise read a suffixed name such as 'resnet50_1' as depth 501
    depth = int(self.backbone.split('_')[0].replace('resnet', ''))
    if depth not in checksums:
        # previously this fell through to an unbound `checksum` variable
        raise ValueError(
            'No ImageNet weights available for backbone (\'{}\'); supported depths are {}.'.format(
                self.backbone, sorted(checksums)
            )
        )

    filename = resnet_filename.format(depth)
    resource = resnet_resource.format(depth)

    return keras.utils.get_file(
        filename,
        resource,
        cache_subdir='models',
        md5_hash=checksums[depth]
    )
""" allowed_backbones = ['resnet50', 'resnet101', 'resnet152'] backbone = self.backbone.split('_')[0] if backbone not in allowed_backbones: raise ValueError('Backbone (\'{}\') not in allowed backbones ({}).'.format(backbone, allowed_backbones)) def preprocess_image(self, inputs): """ Takes as input an image and prepares it for being passed through the network. """ return preprocess_image(inputs, mode='caffe') def resnet_retinanet(num_classes, backbone='resnet50', inputs=None, modifier=None, **kwargs): """ Constructs a retinanet model using a resnet backbone. Args num_classes: Number of classes to predict. backbone: Which backbone to use (one of ('resnet50', 'resnet101', 'resnet152')). inputs: The inputs to the network (defaults to a Tensor of shape (None, None, 3)). modifier: A function handler which can modify the backbone before using it in retinanet (this can be used to freeze backbone layers for example). Returns RetinaNet model with a ResNet backbone. """ # choose default input if inputs is None: if keras.backend.image_data_format() == 'channels_first': inputs = keras.layers.Input(shape=(3, None, None)) else: inputs = keras.layers.Input(shape=(None, None, 3)) # create the resnet backbone if backbone == 'resnet50': resnet = keras_resnet.models.ResNet50(inputs, include_top=False, freeze_bn=True) elif backbone == 'resnet101': resnet = keras_resnet.models.ResNet101(inputs, include_top=False, freeze_bn=True) elif backbone == 'resnet152': resnet = keras_resnet.models.ResNet152(inputs, include_top=False, freeze_bn=True) else: raise ValueError('Backbone (\'{}\') is invalid.'.format(backbone)) # invoke modifier if given if modifier: resnet = modifier(resnet) # create the full model # resnet.outputs contains 4 layers backbone_layers = { 'C2': resnet.outputs[0], 'C3': resnet.outputs[1], 'C4': resnet.outputs[2], 'C5': resnet.outputs[3] } return retinanet.retinanet(inputs=inputs, num_classes=num_classes, backbone_layers=backbone_layers, **kwargs) def 
def resnet50_retinanet(num_classes, inputs=None, **kwargs):
    """ Shortcut for `resnet_retinanet` with the 'resnet50' backbone.
    """
    return resnet_retinanet(num_classes=num_classes, backbone='resnet50', inputs=inputs, **kwargs)


def resnet101_retinanet(num_classes, inputs=None, **kwargs):
    """ Shortcut for `resnet_retinanet` with the 'resnet101' backbone.
    """
    return resnet_retinanet(num_classes=num_classes, backbone='resnet101', inputs=inputs, **kwargs)


def resnet152_retinanet(num_classes, inputs=None, **kwargs):
    """ Shortcut for `resnet_retinanet` with the 'resnet152' backbone.
    """
    return resnet_retinanet(num_classes=num_classes, backbone='resnet152', inputs=inputs, **kwargs)
def default_classification_model(
    num_classes,
    num_anchors,
    pyramid_feature_size=256,
    prior_probability=0.01,
    classification_feature_size=256,
    name='classification_submodel'
):
    """ Creates the default classification submodel.

    Args
        num_classes                 : Number of classes to predict a score for at each feature level.
        num_anchors                 : Number of anchors to predict classification scores for at each feature level.
        pyramid_feature_size        : The number of filters to expect from the feature pyramid levels.
        prior_probability           : Probability passed to the PriorProbability bias initializer of the final layer.
        classification_feature_size : The number of filters to use in the layers in the classification submodel.
        name                        : The name of the submodel.

    Returns
        A keras.models.Model that predicts classes for each anchor.
    """
    options = {
        'kernel_size' : 3,
        'strides'     : 1,
        'padding'     : 'same',
    }

    channels_first = keras.backend.image_data_format() == 'channels_first'
    if channels_first:
        inputs = keras.layers.Input(shape=(pyramid_feature_size, None, None))
    else:
        inputs = keras.layers.Input(shape=(None, None, pyramid_feature_size))

    outputs = inputs
    for i in range(4):
        outputs = keras.layers.Conv2D(
            filters=classification_feature_size,
            activation='relu',
            name='pyramid_classification_{}'.format(i),
            kernel_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=0.01, seed=None),
            bias_initializer='zeros',
            **options
        )(outputs)

    outputs = keras.layers.Conv2D(
        filters=num_classes * num_anchors,
        kernel_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=0.01, seed=None),
        bias_initializer=initializers.PriorProbability(probability=prior_probability),
        name='pyramid_classification',
        **options
    )(outputs)

    # reshape output and apply sigmoid
    if channels_first:
        outputs = keras.layers.Permute((2, 3, 1), name='pyramid_classification_permute')(outputs)
    outputs = keras.layers.Reshape((-1, num_classes), name='pyramid_classification_reshape')(outputs)
    outputs = keras.layers.Activation('sigmoid', name='pyramid_classification_sigmoid')(outputs)

    return keras.models.Model(inputs=inputs, outputs=outputs, name=name)


def default_regression_model(num_values, num_anchors, pyramid_feature_size=256, regression_feature_size=256, name='regression_submodel'):
    """ Creates the default regression submodel.

    Args
        num_values              : Number of values to regress.
        num_anchors             : Number of anchors to regress for each feature level.
        pyramid_feature_size    : The number of filters to expect from the feature pyramid levels.
        regression_feature_size : The number of filters to use in the layers in the regression submodel.
        name                    : The name of the submodel.

    Returns
        A keras.models.Model that predicts regression values for each anchor.
    """
    # All new conv layers except the final one in the
    # RetinaNet (classification) subnets are initialized
    # with bias b = 0 and a Gaussian weight fill with stddev = 0.01.
    options = {
        'kernel_size'        : 3,
        'strides'            : 1,
        'padding'            : 'same',
        'kernel_initializer' : keras.initializers.RandomNormal(mean=0.0, stddev=0.01, seed=None),
        'bias_initializer'   : 'zeros'
    }

    channels_first = keras.backend.image_data_format() == 'channels_first'
    if channels_first:
        inputs = keras.layers.Input(shape=(pyramid_feature_size, None, None))
    else:
        inputs = keras.layers.Input(shape=(None, None, pyramid_feature_size))

    outputs = inputs
    for i in range(4):
        outputs = keras.layers.Conv2D(
            filters=regression_feature_size,
            activation='relu',
            name='pyramid_regression_{}'.format(i),
            **options
        )(outputs)

    outputs = keras.layers.Conv2D(num_anchors * num_values, name='pyramid_regression', **options)(outputs)
    if channels_first:
        outputs = keras.layers.Permute((2, 3, 1), name='pyramid_regression_permute')(outputs)
    outputs = keras.layers.Reshape((-1, num_values), name='pyramid_regression_reshape')(outputs)

    return keras.models.Model(inputs=inputs, outputs=outputs, name=name)


def __create_pyramid_features(backbone_layers, pyramid_levels, feature_size=256):
    """ Creates the FPN layers on top of the backbone features.

    Args
        backbone_layers: a dictionary containing feature stages C3, C4, C5 from the backbone. Also contains C2 if provided.
        pyramid_levels: Pyramid levels in use.
        feature_size : The feature size to use for the resulting feature levels.

    Returns
        output_layers : A dict of feature levels. P3, P4, P5 are always included.
                        P2 is included if level 2 is in use and C2 is provided; P6 and P7 are included if in use.

    Raises
        ValueError: If level 7 is requested without level 6.
    """
    # P7 is computed from P6, so reject the combination before any layer is created
    if 7 in pyramid_levels and 6 not in pyramid_levels:
        raise ValueError("P6 is required to use P7")

    use_p2 = 'C2' in backbone_layers and 2 in pyramid_levels
    output_layers = {}

    # upsample C5 to get P5 from the FPN paper
    P5           = keras.layers.Conv2D(feature_size, kernel_size=1, strides=1, padding='same', name='C5_reduced')(backbone_layers['C5'])
    P5_upsampled = layers.UpsampleLike(name='P5_upsampled')([P5, backbone_layers['C4']])
    P5           = keras.layers.Conv2D(feature_size, kernel_size=3, strides=1, padding='same', name='P5')(P5)
    output_layers["P5"] = P5

    # add P5 elementwise to C4
    P4           = keras.layers.Conv2D(feature_size, kernel_size=1, strides=1, padding='same', name='C4_reduced')(backbone_layers['C4'])
    P4           = keras.layers.Add(name='P4_merged')([P5_upsampled, P4])
    P4_upsampled = layers.UpsampleLike(name='P4_upsampled')([P4, backbone_layers['C3']])
    P4           = keras.layers.Conv2D(feature_size, kernel_size=3, strides=1, padding='same', name='P4')(P4)
    output_layers["P4"] = P4

    # add P4 elementwise to C3
    P3 = keras.layers.Conv2D(feature_size, kernel_size=1, strides=1, padding='same', name='C3_reduced')(backbone_layers['C3'])
    P3 = keras.layers.Add(name='P3_merged')([P4_upsampled, P3])
    if use_p2:
        # the merged (pre-smoothing) P3 is what gets upsampled for P2
        P3_upsampled = layers.UpsampleLike(name='P3_upsampled')([P3, backbone_layers['C2']])
    P3 = keras.layers.Conv2D(feature_size, kernel_size=3, strides=1, padding='same', name='P3')(P3)
    output_layers["P3"] = P3

    if use_p2:
        P2 = keras.layers.Conv2D(feature_size, kernel_size=1, strides=1, padding='same', name='C2_reduced')(backbone_layers['C2'])
        P2 = keras.layers.Add(name='P2_merged')([P3_upsampled, P2])
        P2 = keras.layers.Conv2D(feature_size, kernel_size=3, strides=1, padding='same', name='P2')(P2)
        output_layers["P2"] = P2

    # "P6 is obtained via a 3x3 stride-2 conv on C5"
    if 6 in pyramid_levels:
        P6 = keras.layers.Conv2D(feature_size, kernel_size=3, strides=2, padding='same', name='P6')(backbone_layers['C5'])
        output_layers["P6"] = P6

    # "P7 is computed by applying ReLU followed by a 3x3 stride-2 conv on P6"
    if 7 in pyramid_levels:
        P7 = keras.layers.Activation('relu', name='C6_relu')(P6)
        P7 = keras.layers.Conv2D(feature_size, kernel_size=3, strides=2, padding='same', name='P7')(P7)
        output_layers["P7"] = P7

    return output_layers
def default_submodels(num_classes, num_anchors):
    """ Create a list of default submodels used for object detection.

    The default submodels contains a regression submodel and a classification submodel.

    Args
        num_classes : Number of classes to use.
        num_anchors : Number of base anchors.

    Returns
        A list of tuple, where the first element is the name of the submodel and the second element is the submodel itself.
    """
    regression = default_regression_model(4, num_anchors)
    classification = default_classification_model(num_classes, num_anchors)
    return [
        ('regression', regression),
        ('classification', classification)
    ]


def __build_model_pyramid(name, model, features):
    """ Applies a single submodel to each FPN level.

    Args
        name     : Name of the submodel.
        model    : The submodel to evaluate.
        features : The FPN features.

    Returns
        A tensor containing the response from the submodel on the FPN features.
    """
    responses = [model(feature) for feature in features]
    return keras.layers.Concatenate(axis=1, name=name)(responses)


def __build_pyramid(models, features):
    """ Applies all submodels to each FPN level.

    Args
        models   : List of (name, submodel) pairs to run on each pyramid level (by default only regression, classification).
        features : The FPN features.

    Returns
        A list of tensors, one for each submodel.
    """
    pyramid = []
    for submodel_name, submodel in models:
        pyramid.append(__build_model_pyramid(submodel_name, submodel, features))
    return pyramid
def __build_anchors(anchor_parameters, features):
    """ Builds anchors for the shape of the features from FPN.

    Args
        anchor_parameters : Parameters that determine how anchors are generated.
        features          : The FPN features.

    Returns
        A tensor containing the anchors for the FPN features.

        The shape is:
        ```
        (batch_size, num_anchors, 4)
        ```
    """
    anchors = []
    for i, feature in enumerate(features):
        # sizes and strides are indexed per feature level; ratios and scales are shared
        anchor_layer = layers.Anchors(
            size=anchor_parameters.sizes[i],
            stride=anchor_parameters.strides[i],
            ratios=anchor_parameters.ratios,
            scales=anchor_parameters.scales,
            name='anchors_{}'.format(i)
        )
        anchors.append(anchor_layer(feature))

    return keras.layers.Concatenate(axis=1, name='anchors')(anchors)


def retinanet(
    inputs,
    backbone_layers,
    num_classes,
    num_anchors             = None,
    create_pyramid_features = __create_pyramid_features,
    pyramid_levels          = None,
    submodels               = None,
    name                    = 'retinanet'
):
    """ Construct a RetinaNet model on top of a backbone.

    This model is the minimum model necessary for training (with the unfortunate exception of anchors as output).

    Args
        inputs                  : keras.layers.Input (or list of) for the input to the model.
        backbone_layers         : Dict of backbone feature stages keyed 'C3', 'C4', 'C5' (and 'C2' when available).
        num_classes             : Number of classes to classify.
        num_anchors             : Number of base anchors.
        create_pyramid_features : Functor for creating pyramid features given the features C3, C4, C5, and possibly C2 from the backbone.
        pyramid_levels          : pyramid levels to use.
        submodels               : Submodels to run on each feature map (default is regression and classification submodels).
        name                    : Name of the model.

    Returns
        A keras.models.Model which takes an image as input and outputs the result from each submodel on every pyramid level.

        The order of the outputs is as defined in submodels:
        ```
        [
            regression, classification, other[0], other[1], ...
        ]
        ```

    Raises
        ValueError: If level 2 is requested without a C2 stage, or levels 3, 4 and 5 are not all present.
    """
    # fill in defaults
    if num_anchors is None:
        num_anchors = AnchorParameters.default.num_anchors()

    if submodels is None:
        submodels = default_submodels(num_classes, num_anchors)

    if pyramid_levels is None:
        pyramid_levels = [3, 4, 5, 6, 7]

    # validate the requested levels against what the backbone provides
    if 2 in pyramid_levels and 'C2' not in backbone_layers:
        raise ValueError("C2 not provided by backbone model. Cannot create P2 layers.")

    if not (3 in pyramid_levels and 4 in pyramid_levels and 5 in pyramid_levels):
        raise ValueError("pyramid levels 3, 4, and 5 required for functionality")

    # compute pyramid features as per https://arxiv.org/abs/1708.02002
    features = create_pyramid_features(backbone_layers, pyramid_levels)
    feature_list = [features['P{}'.format(level)] for level in pyramid_levels]

    # for all pyramid levels, run available submodels
    pyramids = __build_pyramid(submodels, feature_list)

    return keras.models.Model(inputs=inputs, outputs=pyramids, name=name)
def retinanet_bbox(
    model                 = None,
    nms                   = True,
    class_specific_filter = True,
    name                  = 'retinanet-bbox',
    anchor_params         = None,
    pyramid_levels        = None,
    nms_threshold         = 0.5,
    score_threshold       = 0.05,
    max_detections        = 300,
    parallel_iterations   = 32,
    **kwargs
):
    """ Construct a RetinaNet model on top of a backbone and adds convenience functions to output boxes directly.

    This model uses the minimum retinanet model and appends a few layers to compute boxes within the graph.
    These layers include applying the regression values to the anchors and performing NMS.

    Args
        model                 : RetinaNet model to append bbox layers to. If None, it will create a RetinaNet model using **kwargs.
        nms                   : Whether to use non-maximum suppression for the filtering step.
        class_specific_filter : Whether to use class specific filtering or filter for the best scoring class only.
        name                  : Name of the model.
        anchor_params         : Struct containing anchor parameters. If None, default values are used.
        pyramid_levels        : pyramid levels to use.
        nms_threshold         : Threshold for the IoU value to determine when a box should be suppressed.
        score_threshold       : Threshold used to prefilter the boxes with.
        max_detections        : Maximum number of detections to keep.
        parallel_iterations   : Number of batch items to process in parallel.
        **kwargs              : Additional kwargs to pass to the minimal retinanet model.

    Returns
        A keras.models.Model which takes an image as input and outputs the detections on the image.

        The order is defined as follows:
        ```
        [
            boxes, scores, labels, other[0], other[1], ...
        ]
        ```
    """
    # if no anchor parameters are passed, use default values
    if anchor_params is None:
        anchor_params = AnchorParameters.default

    # create RetinaNet model
    if model is None:
        # forward pyramid_levels so the created model has the P-layers looked up below;
        # None lets retinanet() fall back to its own default levels
        model = retinanet(num_anchors=anchor_params.num_anchors(), pyramid_levels=pyramid_levels, **kwargs)
    else:
        assert_training_model(model)

    if pyramid_levels is None:
        pyramid_levels = [3, 4, 5, 6, 7]

    assert len(pyramid_levels) == len(anchor_params.sizes), \
        "number of pyramid levels {} should match number of anchor parameter sizes {}".format(len(pyramid_levels), len(anchor_params.sizes))

    pyramid_layer_names = ['P{}'.format(p) for p in pyramid_levels]

    # compute the anchors
    features = [model.get_layer(p_name).output for p_name in pyramid_layer_names]
    anchors  = __build_anchors(anchor_params, features)

    # we expect the anchors, regression and classification values as first output
    regression     = model.outputs[0]
    classification = model.outputs[1]

    # "other" can be any additional output from custom submodels, by default this will be []
    other = model.outputs[2:]

    # apply predicted regression to anchors
    boxes = layers.RegressBoxes(name='boxes')([anchors, regression])
    boxes = layers.ClipBoxes(name='clipped_boxes')([model.inputs[0], boxes])

    # filter detections (apply NMS / score threshold / select top-k)
    detections = layers.FilterDetections(
        nms                   = nms,
        class_specific_filter = class_specific_filter,
        name                  = 'filtered_detections',
        nms_threshold         = nms_threshold,
        score_threshold       = score_threshold,
        max_detections        = max_detections,
        parallel_iterations   = parallel_iterations
    )([boxes, classification] + other)

    # construct the model
    return keras.models.Model(inputs=model.inputs, outputs=detections, name=name)
class SeBackbone(Backbone):
    """ Describes backbone information and provides utility functions.
    """

    def __init__(self, backbone):
        super(SeBackbone, self).__init__(backbone)
        # Classifiers.get returns a pair; only its second element (the preprocessing function) is kept
        _, self.preprocess_image_func = Classifiers.get(self.backbone)

    def retinanet(self, *args, **kwargs):
        """ Returns a retinanet model using the correct backbone.
        """
        return senet_retinanet(*args, backbone=self.backbone, **kwargs)

    def download_imagenet(self):
        """ Downloads ImageNet weights and returns path to weights file.

        Raises
            ValueError: If no entry without the top layers exists for this backbone.
        """
        from classification_models.weights import WEIGHTS_COLLECTION

        weights_path = None
        for entry in WEIGHTS_COLLECTION:
            if entry['model'] != self.backbone or entry['include_top']:
                continue
            weights_path = keras.utils.get_file(entry['name'], entry['url'], cache_subdir='models', file_hash=entry['md5'])

        if weights_path is None:
            raise ValueError('Unable to find imagenet weights for backbone {}!'.format(self.backbone))

        return weights_path

    def validate(self):
        """ Checks whether the backbone string is correct.
        """
        allowed_backbones = ['seresnet18', 'seresnet34', 'seresnet50', 'seresnet101', 'seresnet152',
                             'seresnext50', 'seresnext101', 'senet154']
        # only the part before the first underscore is compared
        backbone = self.backbone.split('_')[0]

        if backbone not in allowed_backbones:
            raise ValueError('Backbone (\'{}\') not in allowed backbones ({}).'.format(backbone, allowed_backbones))

    def preprocess_image(self, inputs):
        """ Takes as input an image and prepares it for being passed through the network.
        """
        return self.preprocess_image_func(inputs)


def senet_retinanet(num_classes, backbone='seresnext50', inputs=None, modifier=None, **kwargs):
    """ Constructs a retinanet model using a squeeze-and-excitation (SE) backbone.

    Args
        num_classes: Number of classes to predict.
        backbone: Which backbone to use (one of ('seresnet18', 'seresnet34', 'seresnet50', 'seresnet101',
            'seresnet152', 'seresnext50', 'seresnext101', 'senet154')).
        inputs: The inputs to the network (defaults to a Tensor of shape (None, None, 3)).
        modifier: A function handler which can modify the backbone before using it in retinanet (this can be used to freeze backbone layers for example).

    Returns
        RetinaNet model with an SE backbone.

    Raises
        ValueError: If no feature layer names are known for `backbone`.
    """
    # names of the layers closing each block, in the order [28x28, 14x14, 7x7]
    block_end_layers = {
        'seresnet18':   ['stage3_unit1_relu1', 'stage4_unit1_relu1', 'relu1'],
        'seresnet34':   ['stage3_unit1_relu1', 'stage4_unit1_relu1', 'relu1'],
        'seresnet50':   ['activation_36', 'activation_66', 'activation_81'],
        'seresnet101':  ['activation_36', 'activation_151', 'activation_166'],
        'seresnet152':  ['activation_56', 'activation_236', 'activation_251'],
        'seresnext50':  ['activation_37', 'activation_67', 'activation_81'],
        'seresnext101': ['activation_37', 'activation_152', 'activation_166'],
        'senet154':     ['activation_59', 'activation_239', 'activation_253'],
    }

    # choose default input
    if inputs is None:
        if keras.backend.image_data_format() == 'channels_first':
            inputs = keras.layers.Input(shape=(3, None, None))
        else:
            inputs = keras.layers.Input(shape=(None, None, 3))

    # the classifier is built before the name lookup, as in the original ordering
    classifier, _ = Classifiers.get(backbone)
    model = classifier(input_tensor=inputs, include_top=False, weights=None)

    # get last conv layer from the end of each block [28x28, 14x14, 7x7]
    if backbone not in block_end_layers:
        raise ValueError('Backbone (\'{}\') is invalid.'.format(backbone))
    layer_names = block_end_layers[backbone]

    layer_outputs = [
        model.get_layer(name=layer_names[0]).output,  # 28x28
        model.get_layer(name=layer_names[1]).output,  # 14x14
        model.get_layer(name=layer_names[2]).output,  # 7x7
    ]

    # create the SE backbone model exposing the three feature maps
    model = keras.models.Model(inputs=inputs, outputs=layer_outputs, name=model.name)

    # invoke modifier if given
    if modifier:
        model = modifier(model)

    # C2 not provided
    backbone_layers = {
        'C3': model.outputs[0],
        'C4': model.outputs[1],
        'C5': model.outputs[2]
    }

    # create the full model
    return retinanet.retinanet(inputs=inputs, num_classes=num_classes, backbone_layers=backbone_layers, **kwargs)
def seresnet18_retinanet(num_classes, inputs=None, **kwargs):
    """ Shortcut for `senet_retinanet` with the 'seresnet18' backbone.
    """
    return senet_retinanet(num_classes=num_classes, backbone='seresnet18', inputs=inputs, **kwargs)


def seresnet34_retinanet(num_classes, inputs=None, **kwargs):
    """ Shortcut for `senet_retinanet` with the 'seresnet34' backbone.
    """
    return senet_retinanet(num_classes=num_classes, backbone='seresnet34', inputs=inputs, **kwargs)


def seresnet50_retinanet(num_classes, inputs=None, **kwargs):
    """ Shortcut for `senet_retinanet` with the 'seresnet50' backbone.
    """
    return senet_retinanet(num_classes=num_classes, backbone='seresnet50', inputs=inputs, **kwargs)


def seresnet101_retinanet(num_classes, inputs=None, **kwargs):
    """ Shortcut for `senet_retinanet` with the 'seresnet101' backbone.
    """
    return senet_retinanet(num_classes=num_classes, backbone='seresnet101', inputs=inputs, **kwargs)


def seresnet152_retinanet(num_classes, inputs=None, **kwargs):
    """ Shortcut for `senet_retinanet` with the 'seresnet152' backbone.
    """
    return senet_retinanet(num_classes=num_classes, backbone='seresnet152', inputs=inputs, **kwargs)


def seresnext50_retinanet(num_classes, inputs=None, **kwargs):
    """ Shortcut for `senet_retinanet` with the 'seresnext50' backbone.
    """
    return senet_retinanet(num_classes=num_classes, backbone='seresnext50', inputs=inputs, **kwargs)


def seresnext101_retinanet(num_classes, inputs=None, **kwargs):
    """ Shortcut for `senet_retinanet` with the 'seresnext101' backbone.
    """
    return senet_retinanet(num_classes=num_classes, backbone='seresnext101', inputs=inputs, **kwargs)


def senet154_retinanet(num_classes, inputs=None, **kwargs):
    """ Shortcut for `senet_retinanet` with the 'senet154' backbone.
    """
    return senet_retinanet(num_classes=num_classes, backbone='senet154', inputs=inputs, **kwargs)
class VGGBackbone(Backbone):
    """ Describes backbone information and provides utility functions.
    """

    def retinanet(self, *args, **kwargs):
        """ Returns a retinanet model using the correct backbone.
        """
        return vgg_retinanet(*args, backbone=self.backbone, **kwargs)

    def download_imagenet(self):
        """ Downloads ImageNet weights and returns path to weights file.
        Weights can be downloaded at https://github.com/fizyr/keras-models/releases .

        Raises
            ValueError: If the backbone is neither 'vgg16' nor 'vgg19'.
        """
        name = self.backbone
        if name == 'vgg16':
            resource = keras.applications.vgg16.vgg16.WEIGHTS_PATH_NO_TOP
            checksum = '6d6bbae143d832006294945121d1f1fc'
        elif name == 'vgg19':
            resource = keras.applications.vgg19.vgg19.WEIGHTS_PATH_NO_TOP
            checksum = '253f8cb515780f3b799900260a226db6'
        else:
            raise ValueError("Backbone '{}' not recognized.".format(name))

        target_name = '{}_weights_tf_dim_ordering_tf_kernels_notop.h5'.format(name)
        return keras.utils.get_file(
            target_name,
            resource,
            cache_subdir='models',
            file_hash=checksum
        )

    def validate(self):
        """ Checks whether the backbone string is correct.
        """
        allowed_backbones = ['vgg16', 'vgg19']

        if self.backbone in allowed_backbones:
            return
        raise ValueError('Backbone (\'{}\') not in allowed backbones ({}).'.format(self.backbone, allowed_backbones))

    def preprocess_image(self, inputs):
        """ Takes as input an image and prepares it for being passed through the network.
        """
        return preprocess_image(inputs, mode='caffe')


def vgg_retinanet(num_classes, backbone='vgg16', inputs=None, modifier=None, **kwargs):
    """ Constructs a retinanet model using a vgg backbone.

    Args
        num_classes: Number of classes to predict.
        backbone: Which backbone to use (one of ('vgg16', 'vgg19')).
        inputs: The inputs to the network (defaults to a Tensor of shape (None, None, 3)).
        modifier: A function handler which can modify the backbone before using it in retinanet (this can be used to freeze backbone layers for example).

    Returns
        RetinaNet model with a VGG backbone.

    Raises
        ValueError: If `backbone` is neither 'vgg16' nor 'vgg19'.
    """
    # choose default input
    if inputs is None:
        inputs = keras.layers.Input(shape=(None, None, 3))

    # create the vgg backbone
    if backbone == 'vgg16':
        vgg = keras.applications.VGG16(input_tensor=inputs, include_top=False, weights=None)
    elif backbone == 'vgg19':
        vgg = keras.applications.VGG19(input_tensor=inputs, include_top=False, weights=None)
    else:
        raise ValueError("Backbone '{}' not recognized.".format(backbone))

    if modifier:
        vgg = modifier(vgg)

    # create the full model
    # the pooling layers closing blocks 3, 4 and 5 serve as C3, C4 and C5; C2 not provided
    stage_to_layer = (('C3', "block3_pool"), ('C4', "block4_pool"), ('C5', "block5_pool"))
    backbone_layers = {}
    for stage, layer_name in stage_to_layer:
        backbone_layers[stage] = vgg.get_layer(layer_name).output

    return retinanet.retinanet(inputs=inputs, num_classes=num_classes, backbone_layers=backbone_layers, **kwargs)
class CocoGenerator(Generator):
    """ Generate data from the COCO dataset.

    See https://github.com/cocodataset/cocoapi/tree/master/PythonAPI for more information.
    """

    def __init__(self, data_dir, set_name, **kwargs):
        """ Initialize a COCO data generator.

        Args
            data_dir: Path to where the COCO dataset is stored.
            set_name: Name of the set to parse.
        """
        self.data_dir  = data_dir
        self.set_name  = set_name
        self.coco      = COCO(os.path.join(data_dir, 'annotations', 'instances_' + set_name + '.json'))
        self.image_ids = self.coco.getImgIds()

        self.load_classes()

        # the base initializer runs last, after ids and classes are set
        super(CocoGenerator, self).__init__(**kwargs)

    def load_classes(self):
        """ Loads the class to label mapping (and inverse) for COCO.
        """
        # load class names (name -> label)
        categories = self.coco.loadCats(self.coco.getCatIds())
        categories.sort(key=lambda x: x['id'])

        self.classes             = {}
        self.coco_labels         = {}
        self.coco_labels_inverse = {}
        for c in categories:
            # labels are assigned consecutively in order of increasing COCO id
            self.coco_labels[len(self.classes)] = c['id']
            self.coco_labels_inverse[c['id']] = len(self.classes)
            self.classes[c['name']] = len(self.classes)

        # also load the reverse (label -> name)
        self.labels = {}
        for key, value in self.classes.items():
            self.labels[value] = key

    def size(self):
        """ Size of the COCO dataset.
        """
        return len(self.image_ids)

    def num_classes(self):
        """ Number of classes in the dataset. For COCO this is 80.
        """
        return len(self.classes)

    def has_label(self, label):
        """ Return True if label is a known label.
        """
        return label in self.labels

    def has_name(self, name):
        """ Returns True if name is a known class.
        """
        return name in self.classes

    def name_to_label(self, name):
        """ Map name to label.
        """
        return self.classes[name]

    def label_to_name(self, label):
        """ Map label to name.
        """
        return self.labels[label]

    def coco_label_to_label(self, coco_label):
        """ Map COCO label to the label as used in the network.
        COCO has some gaps in the order of labels. The highest label is 90, but there are 80 classes.
        """
        return self.coco_labels_inverse[coco_label]

    def coco_label_to_name(self, coco_label):
        """ Map COCO label to name.
        """
        return self.label_to_name(self.coco_label_to_label(coco_label))

    def label_to_coco_label(self, label):
        """ Map label as used by the network to labels as used by COCO.
        """
        return self.coco_labels[label]

    def image_path(self, image_index):
        """ Returns the image path for image_index.
        """
        image_info = self.coco.loadImgs(self.image_ids[image_index])[0]
        path       = os.path.join(self.data_dir, 'images', self.set_name, image_info['file_name'])
        return path

    def image_aspect_ratio(self, image_index):
        """ Compute the aspect ratio for an image with image_index.
        """
        image = self.coco.loadImgs(self.image_ids[image_index])[0]
        return float(image['width']) / float(image['height'])

    def load_image(self, image_index):
        """ Load an image at the image_index.
        """
        path = self.image_path(image_index)
        return read_image_bgr(path)

    def load_annotations(self, image_index):
        """ Load annotations for an image_index.

        Returns
            A dict with 'labels' (float array of shape (N,)) and 'bboxes' (float array of shape (N, 4),
            rows being x1, y1, x2, y2 computed from the COCO x, y, width, height).
        """
        # get ground truth annotations
        annotations_ids = self.coco.getAnnIds(imgIds=self.image_ids[image_index], iscrowd=False)
        annotations     = {'labels': np.empty((0,)), 'bboxes': np.empty((0, 4))}

        # some images appear to miss annotations (like image with id 257034)
        if len(annotations_ids) == 0:
            return annotations

        # parse annotations, collecting into lists so the arrays are built once
        # instead of being re-allocated by np.concatenate for every annotation
        labels = []
        bboxes = []
        for a in self.coco.loadAnns(annotations_ids):
            # some annotations have basically no width / height, skip them
            if a['bbox'][2] < 1 or a['bbox'][3] < 1:
                continue

            labels.append(self.coco_label_to_label(a['category_id']))
            bboxes.append([
                a['bbox'][0],
                a['bbox'][1],
                a['bbox'][0] + a['bbox'][2],
                a['bbox'][1] + a['bbox'][3],
            ])

        if labels:
            # float64 matches the dtype produced by concatenating onto np.empty
            annotations['labels'] = np.array(labels, dtype=np.float64)
            annotations['bboxes'] = np.array(bboxes, dtype=np.float64)

        return annotations
def _parse(value, function, fmt):
    """
    Parse a string into a value, and format a nice ValueError if it fails.

    Returns `function(value)`.
    Any `ValueError` raised is caught and a new `ValueError` is raised
    with message `fmt.format(e)`, where `e` is the caught `ValueError`.
    """
    try:
        parsed = function(value)
    except ValueError as e:
        raise_from(ValueError(fmt.format(e)), None)
    else:
        return parsed


def _read_classes(csv_reader):
    """ Parse the classes file given by csv_reader.

    Returns an OrderedDict mapping class name to integer class id, in file order.
    Raises ValueError on a malformed row, a non-integer id or a duplicate class name.
    """
    result = OrderedDict()
    # line numbers in messages are 1-based
    for line, row in enumerate(csv_reader, 1):
        try:
            class_name, class_id = row
        except ValueError:
            raise_from(ValueError('line {}: format should be \'class_name,class_id\''.format(line)), None)
        class_id = _parse(class_id, int, 'line {}: malformed class ID: {{}}'.format(line))

        if class_name in result:
            raise ValueError('line {}: duplicate class name: \'{}\''.format(line, class_name))
        result[class_name] = class_id
    return result


def _read_annotations(csv_reader, classes):
    """ Read annotations from the csv_reader.

    Returns an OrderedDict mapping image file to a list of box dicts with keys
    'x1', 'x2', 'y1', 'y2' and 'class'; an image listed only with empty fields gets an empty list.
    Raises ValueError on a malformed row, an invalid box or an unknown class name.
    """
    result = OrderedDict()
    # line numbers in messages are 1-based
    for line, row in enumerate(csv_reader, 1):
        try:
            img_file, x1, y1, x2, y2, class_name = row[:6]
        except ValueError:
            raise_from(ValueError('line {}: format should be \'img_file,x1,y1,x2,y2,class_name\' or \'img_file,,,,,\''.format(line)), None)

        if img_file not in result:
            result[img_file] = []

        # If a row contains only an image path, it's an image without annotations.
        if (x1, y1, x2, y2, class_name) == ('', '', '', '', ''):
            continue

        x1 = _parse(x1, int, 'line {}: malformed x1: {{}}'.format(line))
        y1 = _parse(y1, int, 'line {}: malformed y1: {{}}'.format(line))
        x2 = _parse(x2, int, 'line {}: malformed x2: {{}}'.format(line))
        y2 = _parse(y2, int, 'line {}: malformed y2: {{}}'.format(line))

        # Check that the bounding box is valid.
        if x2 <= x1:
            raise ValueError('line {}: x2 ({}) must be higher than x1 ({})'.format(line, x2, x1))
        if y2 <= y1:
            raise ValueError('line {}: y2 ({}) must be higher than y1 ({})'.format(line, y2, y1))

        # check if the current class name is correctly present
        if class_name not in classes:
            raise ValueError('line {}: unknown class name: \'{}\' (classes: {})'.format(line, class_name, classes))

        box = {'x1': x1, 'x2': x2, 'y1': y1, 'y2': y2, 'class': class_name}
        result[img_file].append(box)
    return result


def _open_for_csv(path):
    """ Open a file with flags suitable for csv.reader.

    This is different for python2 it means with mode 'rb',
    for python3 this means 'r' with "universal newlines".
    """
    running_python2 = sys.version_info[0] < 3
    if running_python2:
        return open(path, 'rb')
    return open(path, 'r', newline='')
class CSVGenerator(Generator):
    """ Generate data for a custom CSV dataset.

    See https://github.com/fizyr/keras-retinanet#csv-datasets for more information.
    """

    def __init__(
        self,
        csv_data_file,
        csv_class_file,
        base_dir=None,
        **kwargs
    ):
        """ Initialize a CSV data generator.

        Args
            csv_data_file: Path to the CSV annotations file.
            csv_class_file: Path to the CSV classes file.
            base_dir: Directory w.r.t. where the files are to be searched (defaults to the directory containing the csv_data_file).
            **kwargs: Forwarded unchanged to the base `Generator`.

        Raises
            ValueError: Either CSV file is malformed; the message names the offending file.
        """
        self.image_names = []
        self.image_data = {}
        self.base_dir = base_dir

        # Take base_dir from annotations file if not explicitly specified.
        if self.base_dir is None:
            self.base_dir = os.path.dirname(csv_data_file)

        # parse the provided class file
        try:
            with _open_for_csv(csv_class_file) as file:
                self.classes = _read_classes(csv.reader(file, delimiter=','))
        except ValueError as e:
            raise_from(ValueError('invalid CSV class file: {}: {}'.format(csv_class_file, e)), None)

        # Reverse mapping: class id -> class name.
        self.labels = {}
        for key, value in self.classes.items():
            self.labels[value] = key

        # csv with img_path, x1, y1, x2, y2, class_name
        try:
            with _open_for_csv(csv_data_file) as file:
                self.image_data = _read_annotations(csv.reader(file, delimiter=','), self.classes)
        except ValueError as e:
            raise_from(ValueError('invalid CSV annotations file: {}: {}'.format(csv_data_file, e)), None)
        self.image_names = list(self.image_data.keys())

        # The base class groups images on construction, so the data must be loaded first.
        super(CSVGenerator, self).__init__(**kwargs)

    def size(self):
        """ Size of the dataset.
        """
        return len(self.image_names)

    def num_classes(self):
        """ Number of classes in the dataset.

        Computed as the highest class id plus one, so gaps in the ids are counted.
        """
        return max(self.classes.values()) + 1

    def has_label(self, label):
        """ Return True if label is a known label.
        """
        return label in self.labels

    def has_name(self, name):
        """ Returns True if name is a known class.
        """
        return name in self.classes

    def name_to_label(self, name):
        """ Map name to label.
        """
        return self.classes[name]

    def label_to_name(self, label):
        """ Map label to name.
        """
        return self.labels[label]

    def image_path(self, image_index):
        """ Returns the image path for image_index.
        """
        return os.path.join(self.base_dir, self.image_names[image_index])

    def image_aspect_ratio(self, image_index):
        """ Compute the aspect ratio for an image with image_index.

        Returns
            Width divided by height, as a float.
        """
        # PIL is fast for metadata; the context manager releases the file handle,
        # which matters because this runs once per image while grouping.
        with Image.open(self.image_path(image_index)) as image:
            return float(image.width) / float(image.height)

    def load_image(self, image_index):
        """ Load an image at the image_index.
        """
        return read_image_bgr(self.image_path(image_index))

    def load_annotations(self, image_index):
        """ Load annotations for an image_index.

        Returns
            A dict with 'labels' (float array of shape (N,)) and 'bboxes'
            (float array of shape (N, 4) holding x1, y1, x2, y2); N may be 0.
        """
        path = self.image_names[image_index]
        boxes = self.image_data[path]

        # Build both arrays in one pass instead of re-concatenating per box.
        labels = np.array([self.name_to_label(annot['class']) for annot in boxes], dtype=float)
        bboxes = np.array(
            [
                [float(annot['x1']), float(annot['y1']), float(annot['x2']), float(annot['y2'])]
                for annot in boxes
            ],
            dtype=float,
        ).reshape(-1, 4)  # keeps the (0, 4) shape for images without boxes

        return {'labels': labels, 'bboxes': bboxes}
class Generator(keras.utils.Sequence):
    """ Abstract generator class.

    Subclasses provide dataset access (`size`, `num_classes`, `image_path`, `load_image`,
    `load_annotations`, ...). This base class groups image indices into batches and turns
    a batch into network inputs and anchor targets.
    """

    def __init__(
        self,
        transform_generator=None,
        visual_effect_generator=None,
        batch_size=1,
        group_method='ratio',  # one of 'none', 'random', 'ratio'
        shuffle_groups=True,
        image_min_side=800,
        image_max_side=1333,
        no_resize=False,
        transform_parameters=None,
        compute_anchor_targets=anchor_targets_bbox,
        compute_shapes=guess_shapes,
        preprocess_image=preprocess_image,
        config=None
    ):
        """ Initialize Generator object.

        Args
            transform_generator    : A generator used to randomly transform images and annotations.
            visual_effect_generator: An iterator of callables; one is drawn per image and applied to the image only.
            batch_size             : The size of the batches to generate.
            group_method           : Determines how images are grouped together (defaults to 'ratio', one of ('none', 'random', 'ratio')).
            shuffle_groups         : If True, shuffles the groups each epoch.
            image_min_side         : After resizing the minimum side of an image is equal to image_min_side.
            image_max_side         : If after resizing the maximum side is larger than image_max_side, scales down further so that the max side is equal to image_max_side.
            no_resize              : If True, no image/annotation resizing is performed.
            transform_parameters   : The transform parameters used for data augmentation.
            compute_anchor_targets : Function handler for computing the targets of anchors for an image and its annotations.
            compute_shapes         : Function handler for computing the shapes of the pyramid for a given input.
            preprocess_image       : Function handler for preprocessing an image (scaling / normalizing) for passing through a network.
            config                 : Optional configuration; when it contains 'anchor_parameters' or 'pyramid_levels'
                                     those entries are parsed and used when generating anchors.
        """
        self.transform_generator = transform_generator
        self.visual_effect_generator = visual_effect_generator
        self.batch_size = int(batch_size)
        self.group_method = group_method
        self.shuffle_groups = shuffle_groups
        self.image_min_side = image_min_side
        self.image_max_side = image_max_side
        self.no_resize = no_resize
        self.transform_parameters = transform_parameters or TransformParameters()
        self.compute_anchor_targets = compute_anchor_targets
        self.compute_shapes = compute_shapes
        self.preprocess_image = preprocess_image
        self.config = config

        # Define groups
        self.group_images()

        # Shuffle when initializing
        if self.shuffle_groups:
            self.on_epoch_end()

    def on_epoch_end(self):
        """ Shuffle the order of the groups when shuffle_groups is enabled.
        """
        if self.shuffle_groups:
            random.shuffle(self.groups)

    def size(self):
        """ Size of the dataset.
        """
        raise NotImplementedError('size method not implemented')

    def num_classes(self):
        """ Number of classes in the dataset.
        """
        raise NotImplementedError('num_classes method not implemented')

    def has_label(self, label):
        """ Returns True if label is a known label.
        """
        raise NotImplementedError('has_label method not implemented')

    def has_name(self, name):
        """ Returns True if name is a known class.
        """
        raise NotImplementedError('has_name method not implemented')

    def name_to_label(self, name):
        """ Map name to label.
        """
        raise NotImplementedError('name_to_label method not implemented')

    def label_to_name(self, label):
        """ Map label to name.
        """
        raise NotImplementedError('label_to_name method not implemented')

    def image_aspect_ratio(self, image_index):
        """ Compute the aspect ratio for an image with image_index.
        """
        raise NotImplementedError('image_aspect_ratio method not implemented')

    def image_path(self, image_index):
        """ Get the path to an image.
        """
        raise NotImplementedError('image_path method not implemented')

    def load_image(self, image_index):
        """ Load an image at the image_index.
        """
        raise NotImplementedError('load_image method not implemented')

    def load_annotations(self, image_index):
        """ Load annotations for an image_index.

        Implementations must return a dict containing 'labels' and 'bboxes'
        (checked in `load_annotations_group`).
        """
        raise NotImplementedError('load_annotations method not implemented')

    def load_annotations_group(self, group):
        """ Load annotations for all images in group.

        Returns
            A list with one annotations dict per image index in `group`.
        """
        annotations_group = [self.load_annotations(image_index) for image_index in group]
        for annotations in annotations_group:
            assert(isinstance(annotations, dict)), '\'load_annotations\' should return a list of dictionaries, received: {}'.format(type(annotations))
            assert('labels' in annotations), '\'load_annotations\' should return a list of dictionaries that contain \'labels\' and \'bboxes\'.'
            assert('bboxes' in annotations), '\'load_annotations\' should return a list of dictionaries that contain \'labels\' and \'bboxes\'.'

        return annotations_group

    def filter_annotations(self, image_group, annotations_group, group):
        """ Filter annotations by removing those that are outside of the image bounds or whose width/height <= 0.

        A warning listing the removed boxes is emitted for every image that has any.
        The entries of `annotations_group` are updated in place.

        Returns
            The (image_group, annotations_group) pair.
        """
        # test all annotations
        for index, (image, annotations) in enumerate(zip(image_group, annotations_group)):
            # test x2 <= x1 | y2 <= y1 | x1 < 0 | y1 < 0 | x2 > image.shape[1] | y2 > image.shape[0]
            invalid_indices = np.where(
                (annotations['bboxes'][:, 2] <= annotations['bboxes'][:, 0]) |
                (annotations['bboxes'][:, 3] <= annotations['bboxes'][:, 1]) |
                (annotations['bboxes'][:, 0] < 0) |
                (annotations['bboxes'][:, 1] < 0) |
                (annotations['bboxes'][:, 2] > image.shape[1]) |
                (annotations['bboxes'][:, 3] > image.shape[0])
            )[0]

            # delete invalid indices
            if len(invalid_indices):
                warnings.warn('Image {} with id {} (shape {}) contains the following invalid boxes: {}.'.format(
                    self.image_path(group[index]),
                    group[index],
                    image.shape,
                    annotations['bboxes'][invalid_indices, :]
                ))
                # Drop the same rows from every per-box array ('labels', 'bboxes', ...).
                for k in annotations_group[index].keys():
                    annotations_group[index][k] = np.delete(annotations[k], invalid_indices, axis=0)
        return image_group, annotations_group

    def load_image_group(self, group):
        """ Load images for all images in a group.
        """
        return [self.load_image(image_index) for image_index in group]

    def random_visual_effect_group_entry(self, image, annotations):
        """ Randomly transforms image and annotation.

        Only the image is changed; the annotations are returned as received.
        """
        visual_effect = next(self.visual_effect_generator)
        # apply visual effect
        image = visual_effect(image)
        return image, annotations

    def random_visual_effect_group(self, image_group, annotations_group):
        """ Randomly apply visual effect on each image.
        """
        assert(len(image_group) == len(annotations_group))

        if self.visual_effect_generator is None:
            # do nothing
            return image_group, annotations_group

        for index in range(len(image_group)):
            # apply effect on a single group entry
            image_group[index], annotations_group[index] = self.random_visual_effect_group_entry(
                image_group[index], annotations_group[index]
            )

        return image_group, annotations_group

    def random_transform_group_entry(self, image, annotations, transform=None):
        """ Randomly transforms image and annotation.

        Args
            image      : The image to transform.
            annotations: The annotations dict belonging to the image.
            transform  : Explicit transform to apply; when None, one is drawn from
                         `transform_generator` (if that is set).
        """
        # randomly transform both image and annotations
        if transform is not None or self.transform_generator:
            if transform is None:
                transform = adjust_transform_for_image(next(self.transform_generator), image, self.transform_parameters.relative_translation)

            # apply transformation to image
            image = apply_transform(transform, image, self.transform_parameters)

            # Transform the bounding boxes in the annotations.
            # Work on a copy so the array that was passed in is not modified.
            annotations['bboxes'] = annotations['bboxes'].copy()
            for index in range(annotations['bboxes'].shape[0]):
                annotations['bboxes'][index, :] = transform_aabb(transform, annotations['bboxes'][index, :])

        return image, annotations

    def random_transform_group(self, image_group, annotations_group):
        """ Randomly transforms each image and its annotations.
        """

        assert(len(image_group) == len(annotations_group))

        for index in range(len(image_group)):
            # transform a single group entry
            image_group[index], annotations_group[index] = self.random_transform_group_entry(image_group[index], annotations_group[index])

        return image_group, annotations_group

    def resize_image(self, image):
        """ Resize an image using image_min_side and image_max_side.

        Returns
            A tuple (image, scale); the scale is 1 when no_resize is set.
        """
        if self.no_resize:
            return image, 1
        else:
            return resize_image(image, min_side=self.image_min_side, max_side=self.image_max_side)

    def preprocess_group_entry(self, image, annotations):
        """ Preprocess image and its annotations.
        """
        # resize image
        image, image_scale = self.resize_image(image)

        # preprocess the image
        image = self.preprocess_image(image)

        # apply resizing to annotations too (in place)
        annotations['bboxes'] *= image_scale

        # convert to the wanted keras floatx
        image = keras.backend.cast_to_floatx(image)

        return image, annotations

    def preprocess_group(self, image_group, annotations_group):
        """ Preprocess each image and its annotations in its group.
        """
        assert(len(image_group) == len(annotations_group))

        for index in range(len(image_group)):
            # preprocess a single group entry
            image_group[index], annotations_group[index] = self.preprocess_group_entry(image_group[index], annotations_group[index])

        return image_group, annotations_group

    def group_images(self):
        """ Order the images according to self.group_method and make groups of self.batch_size.
        """
        # determine the order of the images
        order = list(range(self.size()))
        if self.group_method == 'random':
            random.shuffle(order)
        elif self.group_method == 'ratio':
            order.sort(key=self.image_aspect_ratio)

        # divide into groups, one group = one batch
        # (indices wrap around, so the last group is filled up from the start of the order)
        self.groups = [[order[x % len(order)] for x in range(i, i + self.batch_size)] for i in range(0, len(order), self.batch_size)]

    def compute_inputs(self, image_group):
        """ Compute inputs for the network using an image_group.
        """
        # get the max image shape
        max_shape = tuple(max(image.shape[x] for image in image_group) for x in range(3))

        # construct an image batch object
        image_batch = np.zeros((self.batch_size,) + max_shape, dtype=keras.backend.floatx())

        # copy all images to the upper left part of the image batch object
        for image_index, image in enumerate(image_group):
            image_batch[image_index, :image.shape[0], :image.shape[1], :image.shape[2]] = image

        if keras.backend.image_data_format() == 'channels_first':
            image_batch = image_batch.transpose((0, 3, 1, 2))

        return image_batch

    def generate_anchors(self, image_shape):
        """ Generate the anchors for image_shape, honouring anchor settings found in self.config.
        """
        anchor_params = None
        pyramid_levels = None
        if self.config and 'anchor_parameters' in self.config:
            anchor_params = parse_anchor_parameters(self.config)
        if self.config and 'pyramid_levels' in self.config:
            pyramid_levels = parse_pyramid_levels(self.config)

        return anchors_for_shape(image_shape, anchor_params=anchor_params, pyramid_levels=pyramid_levels, shapes_callback=self.compute_shapes)

    def compute_targets(self, image_group, annotations_group):
        """ Compute target outputs for the network using images and their annotations.
        """
        # get the max image shape
        max_shape = tuple(max(image.shape[x] for image in image_group) for x in range(3))
        anchors = self.generate_anchors(max_shape)

        batches = self.compute_anchor_targets(
            anchors,
            image_group,
            annotations_group,
            self.num_classes()
        )

        return list(batches)

    def compute_input_output(self, group):
        """ Compute inputs and target outputs for the network.
        """
        # load images and annotations
        image_group = self.load_image_group(group)
        annotations_group = self.load_annotations_group(group)

        # check validity of annotations
        image_group, annotations_group = self.filter_annotations(image_group, annotations_group, group)

        # randomly apply visual effect
        image_group, annotations_group = self.random_visual_effect_group(image_group, annotations_group)

        # randomly transform data
        image_group, annotations_group = self.random_transform_group(image_group, annotations_group)

        # perform preprocessing steps
        image_group, annotations_group = self.preprocess_group(image_group, annotations_group)

        # compute network inputs
        inputs = self.compute_inputs(image_group)

        # compute network targets
        targets = self.compute_targets(image_group, annotations_group)

        return inputs, targets

    def __len__(self):
        """
        Number of batches for generator.
        """

        return len(self.groups)

    def __getitem__(self, index):
        """
        Keras sequence method for generating batches.
        """
        group = self.groups[index]
        inputs, targets = self.compute_input_output(group)

        return inputs, targets
# Class name -> label id. 'Misc' and 'DontCare' share label 7, so the reverse
# mapping built in KittiGenerator keeps only one name for that id.
kitti_classes = {
    'Car': 0,
    'Van': 1,
    'Truck': 2,
    'Pedestrian': 3,
    'Person_sitting': 4,
    'Cyclist': 5,
    'Tram': 6,
    'Misc': 7,
    'DontCare': 7
}


class KittiGenerator(Generator):
    """ Generate data for a KITTI dataset.

    See http://www.cvlibs.net/datasets/kitti/ for more information.
    """

    def __init__(
        self,
        base_dir,
        subset='train',
        **kwargs
    ):
        """ Initialize a KITTI data generator.

        Args
            base_dir: Directory containing one sub-directory per subset, each with
                      'labels' and 'images' folders.
            subset: The subset to generate data for (defaults to 'train').
            **kwargs: Forwarded unchanged to the base `Generator`.
        """
        self.base_dir = base_dir

        label_dir = os.path.join(self.base_dir, subset, 'labels')
        image_dir = os.path.join(self.base_dir, subset, 'images')

        # Columns of a KITTI label file (space separated):
        #   1    type         Describes the type of object: 'Car', 'Van', 'Truck',
        #                     'Pedestrian', 'Person_sitting', 'Cyclist', 'Tram',
        #                     'Misc' or 'DontCare'
        #   1    truncated    Float from 0 (non-truncated) to 1 (truncated), where
        #                     truncated refers to the object leaving image boundaries
        #   1    occluded     Integer (0,1,2,3) indicating occlusion state:
        #                     0 = fully visible, 1 = partly occluded
        #                     2 = largely occluded, 3 = unknown
        #   1    alpha        Observation angle of object, ranging [-pi..pi]
        #   4    bbox         2D bounding box of object in the image (0-based index):
        #                     contains left, top, right, bottom pixel coordinates
        #   3    dimensions   3D object dimensions: height, width, length (in meters)
        #   3    location     3D object location x,y,z in camera coordinates (in meters)
        #   1    rotation_y   Rotation ry around Y-axis in camera coordinates [-pi..pi]
        fieldnames = ['type', 'truncated', 'occluded', 'alpha', 'left', 'top', 'right', 'bottom', 'dh', 'dw', 'dl',
                      'lx', 'ly', 'lz', 'ry']

        # Reverse mapping: label id -> class name.
        self.labels = {}
        self.classes = kitti_classes
        for name, label in self.classes.items():
            self.labels[label] = name

        self.image_data = dict()
        self.images = []
        for i, fn in enumerate(os.listdir(label_dir)):
            label_fp = os.path.join(label_dir, fn)
            # The image shares the label file's base name, with a .png extension.
            image_fp = os.path.join(image_dir, fn.replace('.txt', '.png'))

            self.images.append(image_fp)

            with open(label_fp, 'r') as csv_file:
                reader = csv.DictReader(csv_file, delimiter=' ', fieldnames=fieldnames)
                boxes = []
                for row in reader:
                    label = row['type']
                    cls_id = kitti_classes[label]

                    # Coordinates stay as strings here; load_annotations converts them.
                    annotation = {'cls_id': cls_id, 'x1': row['left'], 'x2': row['right'], 'y2': row['bottom'], 'y1': row['top']}
                    boxes.append(annotation)

            self.image_data[i] = boxes

        super(KittiGenerator, self).__init__(**kwargs)

    def size(self):
        """ Size of the dataset.
        """
        return len(self.images)

    def num_classes(self):
        """ Number of classes in the dataset.
        """
        return max(self.classes.values()) + 1

    def has_label(self, label):
        """ Return True if label is a known label.
        """
        return label in self.labels

    def has_name(self, name):
        """ Returns True if name is a known class.
        """
        return name in self.classes

    def name_to_label(self, name):
        """ Map name to label.

        Raises
            KeyError: `name` is not a known class (see `has_name`).
        """
        return self.classes[name]

    def label_to_name(self, label):
        """ Map label to name.
        """
        return self.labels[label]

    def image_aspect_ratio(self, image_index):
        """ Compute the aspect ratio for an image with image_index.
        """
        # PIL is fast for metadata; close the handle once the size has been read.
        with Image.open(self.images[image_index]) as image:
            return float(image.width) / float(image.height)

    def image_path(self, image_index):
        """ Get the path to an image.
        """
        return self.images[image_index]

    def load_image(self, image_index):
        """ Load an image at the image_index.
        """
        return read_image_bgr(self.image_path(image_index))

    def load_annotations(self, image_index):
        """ Load annotations for an image_index.

        Returns
            A dict with 'labels' (array of shape (N,)) and 'bboxes'
            (array of shape (N, 4) holding x1, y1, x2, y2).
        """
        image_data = self.image_data[image_index]
        annotations = {'labels': np.empty((len(image_data),)), 'bboxes': np.empty((len(image_data), 4))}

        for idx, ann in enumerate(image_data):
            annotations['bboxes'][idx, 0] = float(ann['x1'])
            annotations['bboxes'][idx, 1] = float(ann['y1'])
            annotations['bboxes'][idx, 2] = float(ann['x2'])
            annotations['bboxes'][idx, 3] = float(ann['y2'])
            annotations['labels'][idx] = int(ann['cls_id'])

        return annotations
def load_hierarchy(metadata_dir, version='v4'):
    """ Load the Open Images label hierarchy for the given dataset version.

    Args
        metadata_dir: Directory that contains the hierarchy JSON file.
        version: One of 'challenge2018', 'v4' or 'v3'.

    Returns
        The parsed JSON content of the hierarchy file.

    Raises
        NotImplementedError: `version` is not one of the supported values.
    """
    if version == 'challenge2018':
        hierarchy = 'bbox_labels_500_hierarchy.json'
    elif version in ('v4', 'v3'):
        hierarchy = 'bbox_labels_600_hierarchy.json'
    else:
        # Fail with a clear message instead of passing None to os.path.join.
        raise NotImplementedError('There is no label hierarchy for version \'{}\''.format(version))

    hierarchy_json = os.path.join(metadata_dir, hierarchy)
    with open(hierarchy_json) as f:
        hierarchy_data = json.loads(f.read())

    return hierarchy_data


def load_hierarchy_children(hierarchy):
    """ Return the 'LabelName' of `hierarchy` followed by those of all its descendants (depth first).
    """
    res = [hierarchy['LabelName']]

    if 'Subcategory' in hierarchy:
        for subcategory in hierarchy['Subcategory']:
            res.extend(load_hierarchy_children(subcategory))

    return res


def find_hierarchy_parent(hierarchy, parent_cls):
    """ Return the hierarchy node whose 'LabelName' equals parent_cls, or None when there is none.
    """
    if hierarchy['LabelName'] == parent_cls:
        return hierarchy
    elif 'Subcategory' in hierarchy:
        for child in hierarchy['Subcategory']:
            res = find_hierarchy_parent(child, parent_cls)
            if res is not None:
                return res

    return None


def get_labels(metadata_dir, version='v4'):
    """ Read the class descriptions of an Open Images release.

    Args
        metadata_dir: Directory that contains the class description files.
        version: 'v4' and 'challenge2018' read a single boxable-classes CSV; any other
                 value reads 'class-descriptions.csv' plus 'classes-bbox-trainable.txt'.

    Returns
        A tuple (id_to_labels, cls_index): the first maps a consecutive integer id to
        the class description (with quote characters removed), the second maps the
        class identifier from the file to that integer id.
    """
    if version == 'v4' or version == 'challenge2018':
        csv_file = 'class-descriptions-boxable.csv' if version == 'v4' else 'challenge-2018-class-descriptions-500.csv'

        boxable_classes_descriptions = os.path.join(metadata_dir, csv_file)
        id_to_labels = {}
        cls_index = {}

        i = 0
        with open(boxable_classes_descriptions) as f:
            for row in csv.reader(f):
                # make sure the csv row is not empty (usually the last one)
                if len(row):
                    label = row[0]
                    description = row[1].replace("\"", "").replace("'", "").replace('`', '')

                    id_to_labels[i] = description
                    cls_index[label] = i

                    i += 1
    else:
        trainable_classes_path = os.path.join(metadata_dir, 'classes-bbox-trainable.txt')
        description_path = os.path.join(metadata_dir, 'class-descriptions.csv')

        description_table = {}
        with open(description_path) as f:
            for row in csv.reader(f):
                # make sure the csv row is not empty (usually the last one)
                if len(row):
                    description_table[row[0]] = row[1].replace("\"", "").replace("'", "").replace('`', '')

        # Read as text: splitting bytes on a str separator fails on Python 3.
        # Blank lines (such as the one produced by a trailing newline) are skipped
        # so they are not looked up as class identifiers.
        with open(trainable_classes_path, 'r') as f:
            trainable_classes = [line.strip() for line in f if line.strip()]

        id_to_labels = {i: description_table[c] for i, c in enumerate(trainable_classes)}
        cls_index = {c: i for i, c in enumerate(trainable_classes)}

    return id_to_labels, cls_index


def generate_images_annotations_json(main_dir, metadata_dir, subset, cls_index, version='v4'):
    """ Build the per-image annotation table from the Open Images bounding box CSV.

    Args
        main_dir: Dataset root; images are read from '<main_dir>/images/<subset>'
                  ('train' is always used for 'challenge2018').
        metadata_dir: Directory that contains the annotation CSV files.
        subset: Dataset subset; for 'challenge2018' only 'train' and 'validation' are handled.
        cls_index: Mapping from class identifier to integer class id; rows with other classes are ignored.
        version: 'v4', 'challenge2018', or anything else for the older file layout.

    Returns
        A dict keyed by image id. Each value holds the image width 'w', height 'h' and
        a list 'boxes' of {'cls_id', 'x1', 'x2', 'y1', 'y2'} with the coordinates as
        the fractional values read from the CSV.

    Raises
        NotImplementedError: 'challenge2018' was requested with an unsupported subset.
        ValueError: A row describes an empty or inverted box.
    """
    validation_image_ids = {}

    if version == 'v4':
        annotations_path = os.path.join(metadata_dir, subset, '{}-annotations-bbox.csv'.format(subset))
    elif version == 'challenge2018':
        validation_image_ids_path = os.path.join(metadata_dir, 'challenge-2018-image-ids-valset-od.csv')

        with open(validation_image_ids_path, 'r') as csv_file:
            reader = csv.DictReader(csv_file, fieldnames=['ImageID'])
            next(reader)  # skip the header row
            for line, row in enumerate(reader):
                image_id = row['ImageID']
                validation_image_ids[image_id] = True

        annotations_path = os.path.join(metadata_dir, 'challenge-2018-train-annotations-bbox.csv')
    else:
        annotations_path = os.path.join(metadata_dir, subset, 'annotations-human-bbox.csv')

    fieldnames = ['ImageID', 'Source', 'LabelName', 'Confidence',
                  'XMin', 'XMax', 'YMin', 'YMax',
                  'IsOccluded', 'IsTruncated', 'IsGroupOf', 'IsDepiction', 'IsInside']

    id_annotations = dict()
    with open(annotations_path, 'r') as csv_file:
        reader = csv.DictReader(csv_file, fieldnames=fieldnames)
        next(reader)  # skip the header row

        # Cache of image id -> (width, height) so each image is opened only once.
        images_sizes = {}
        for line, row in enumerate(reader):
            frame = row['ImageID']

            if version == 'challenge2018':
                if subset == 'train':
                    if frame in validation_image_ids:
                        continue
                elif subset == 'validation':
                    if frame not in validation_image_ids:
                        continue
                else:
                    raise NotImplementedError('This generator handles only the train and validation subsets')

            class_name = row['LabelName']

            if class_name not in cls_index:
                continue

            cls_id = cls_index[class_name]

            if version == 'challenge2018':
                # We recommend participants to use the provided subset of the training set as a validation set.
                # This is preferable over using the V4 val/test sets, as the training set is more densely annotated.
                img_path = os.path.join(main_dir, 'images', 'train', frame + '.jpg')
            else:
                img_path = os.path.join(main_dir, 'images', subset, frame + '.jpg')

            if frame in images_sizes:
                width, height = images_sizes[frame]
            else:
                try:
                    with Image.open(img_path) as img:
                        width, height = img.width, img.height
                        images_sizes[frame] = (width, height)
                except Exception as ex:
                    # Unreadable images are fatal for challenge2018 and skipped otherwise.
                    if version == 'challenge2018':
                        raise ex
                    continue

            x1 = float(row['XMin'])
            x2 = float(row['XMax'])
            y1 = float(row['YMin'])
            y2 = float(row['YMax'])

            x1_int = int(round(x1 * width))
            x2_int = int(round(x2 * width))
            y1_int = int(round(y1 * height))
            y2_int = int(round(y2 * height))

            # Check that the bounding box is valid.
            if x2 <= x1:
                raise ValueError('line {}: x2 ({}) must be higher than x1 ({})'.format(line, x2, x1))
            if y2 <= y1:
                raise ValueError('line {}: y2 ({}) must be higher than y1 ({})'.format(line, y2, y1))

            # Boxes that collapse to zero pixels after rounding are dropped with a warning.
            if y2_int == y1_int:
                warnings.warn('filtering line {}: rounding y2 ({}) and y1 ({}) makes them equal'.format(line, y2, y1))
                continue

            if x2_int == x1_int:
                warnings.warn('filtering line {}: rounding x2 ({}) and x1 ({}) makes them equal'.format(line, x2, x1))
                continue

            img_id = row['ImageID']
            annotation = {'cls_id': cls_id, 'x1': x1, 'x2': x2, 'y1': y1, 'y2': y2}

            if img_id in id_annotations:
                annotations = id_annotations[img_id]
                annotations['boxes'].append(annotation)
            else:
                id_annotations[img_id] = {'w': width, 'h': height, 'boxes': [annotation]}
    return id_annotations
class OpenImagesGenerator(Generator):
    """ Generate data for an Open Images dataset (versions 'v3', 'v4' and 'challenge2018').
    """

    def __init__(
        self, main_dir, subset, version='v4',
        labels_filter=None, annotation_cache_dir='.',
        parent_label=None,
        **kwargs
    ):
        """ Initialize an Open Images data generator.

        Args
            main_dir: Dataset root containing the 'images' folder and the metadata folder of the chosen version.
            subset: Dataset subset; also used as the file name of the annotation cache.
            version: One of 'challenge2018', 'v4' or 'v3'.
            labels_filter: Optional list of label descriptions to restrict the dataset to.
            annotation_cache_dir: Directory where '<subset>.json' is read from, or written to when missing.
            parent_label: Optional label description; restricts the dataset to this label and its
                          descendants in the hierarchy (takes precedence over labels_filter).
            **kwargs: Forwarded unchanged to the base `Generator`.

        Raises
            NotImplementedError: `version` is not supported.
        """
        if version == 'challenge2018':
            metadata = 'challenge2018'
        elif version == 'v4':
            metadata = '2018_04'
        elif version == 'v3':
            metadata = '2017_11'
        else:
            raise NotImplementedError('There is currently no implementation for versions older than v3')

        if version == 'challenge2018':
            self.base_dir = os.path.join(main_dir, 'images', 'train')
        else:
            self.base_dir = os.path.join(main_dir, 'images', subset)

        metadata_dir = os.path.join(main_dir, metadata)
        annotation_cache_json = os.path.join(annotation_cache_dir, subset + '.json')

        self.hierarchy = load_hierarchy(metadata_dir, version=version)
        id_to_labels, cls_index = get_labels(metadata_dir, version=version)

        if os.path.exists(annotation_cache_json):
            with open(annotation_cache_json, 'r') as f:
                self.annotations = json.loads(f.read())
        else:
            self.annotations = generate_images_annotations_json(main_dir, metadata_dir, subset, cls_index, version=version)
            # Close the cache file explicitly so the data is flushed to disk.
            with open(annotation_cache_json, 'w') as f:
                json.dump(self.annotations, f)

        if labels_filter is not None or parent_label is not None:
            self.id_to_labels, self.annotations = self.__filter_data(id_to_labels, cls_index, labels_filter, parent_label)
        else:
            self.id_to_labels = id_to_labels

        # Positional image index -> image id (the key into self.annotations).
        self.id_to_image_id = dict(enumerate(self.annotations))

        super(OpenImagesGenerator, self).__init__(**kwargs)

    def __filter_data(self, id_to_labels, cls_index, labels_filter=None, parent_label=None):
        """
        If you want to work with a subset of the labels just set a list with trainable labels
        :param id_to_labels: Mapping from class id to label description.
        :param cls_index: Mapping from class identifier to class id.
        :param labels_filter: Ex: labels_filter = ['Helmet', 'Hat', 'Analog television']
        :param parent_label: If parent_label is set this will bring you the parent label
        but also its children in the semantic hierarchy as defined in OID, ex: Animal
        hierarchical tree
        :return: A tuple (id_to_labels, annotations) restricted to the selected labels, with
        the class ids renumbered consecutively from 0. Images left without boxes are dropped.
        """

        children_id_to_labels = {}

        if parent_label is None:
            # there is/are no other sublabel(s) other than the labels itself

            for label in labels_filter:
                for i, lb in id_to_labels.items():
                    if lb == label:
                        children_id_to_labels[i] = label
                        break
        else:
            # Resolve the label description to its class identifier.
            parent_cls = None
            for i, lb in id_to_labels.items():
                if lb == parent_label:
                    parent_id = i
                    for c, index in cls_index.items():
                        if index == parent_id:
                            parent_cls = c
                    break

            if parent_cls is None:
                raise Exception('Couldnt find label {}'.format(parent_label))

            parent_tree = find_hierarchy_parent(self.hierarchy, parent_cls)

            if parent_tree is None:
                raise Exception('Couldnt find parent {} in the semantic hierarchical tree'.format(parent_label))

            children = load_hierarchy_children(parent_tree)

            for cls in children:
                index = cls_index[cls]
                label = id_to_labels[index]
                children_id_to_labels[index] = label

        # Old class id -> new consecutive class id.
        id_map = {ind: i for i, ind in enumerate(children_id_to_labels.keys())}

        filtered_annotations = {}
        for k in self.annotations:
            img_ann = self.annotations[k]

            filtered_boxes = []
            for ann in img_ann['boxes']:
                cls_id = ann['cls_id']
                if cls_id in children_id_to_labels:
                    # The box dictionary is updated in place with the renumbered id.
                    ann['cls_id'] = id_map[cls_id]
                    filtered_boxes.append(ann)

            if len(filtered_boxes) > 0:
                filtered_annotations[k] = {'w': img_ann['w'], 'h': img_ann['h'], 'boxes': filtered_boxes}

        children_id_to_labels = {id_map[i]: l for (i, l) in children_id_to_labels.items()}

        return children_id_to_labels, filtered_annotations

    def size(self):
        """ Size of the dataset.
        """
        return len(self.annotations)

    def num_classes(self):
        """ Number of classes in the dataset.
        """
        return len(self.id_to_labels)

    def has_label(self, label):
        """ Return True if label is a known label.
        """
        return label in self.id_to_labels

    def has_name(self, name):
        """ Returns True if name is a known class.
        """
        raise NotImplementedError()

    def name_to_label(self, name):
        """ Map name to label (not implemented for this generator).
        """
        raise NotImplementedError()

    def label_to_name(self, label):
        """ Map label to name.
        """
        return self.id_to_labels[label]

    def image_aspect_ratio(self, image_index):
        """ Compute the aspect ratio for an image with image_index from the stored width and height.
        """
        img_annotations = self.annotations[self.id_to_image_id[image_index]]
        height, width = img_annotations['h'], img_annotations['w']
        return float(width) / float(height)

    def image_path(self, image_index):
        """ Get the path to an image.
        """
        path = os.path.join(self.base_dir, self.id_to_image_id[image_index] + '.jpg')
        return path

    def load_image(self, image_index):
        """ Load an image at the image_index.
        """
        return read_image_bgr(self.image_path(image_index))

    def load_annotations(self, image_index):
        """ Load annotations for an image_index.

        The stored coordinates are multiplied by the image width / height.

        Returns
            A dict with 'labels' (array of shape (N,)) and 'bboxes'
            (array of shape (N, 4) holding x1, y1, x2, y2).
        """
        image_annotations = self.annotations[self.id_to_image_id[image_index]]

        labels = image_annotations['boxes']
        height, width = image_annotations['h'], image_annotations['w']

        annotations = {'labels': np.empty((len(labels),)), 'bboxes': np.empty((len(labels), 4))}
        for idx, ann in enumerate(labels):
            cls_id = ann['cls_id']
            x1 = ann['x1'] * width
            x2 = ann['x2'] * width
            y1 = ann['y1'] * height
            y2 = ann['y2'] * height

            annotations['bboxes'][idx, 0] = x1
            annotations['bboxes'][idx, 1] = y1
            annotations['bboxes'][idx, 2] = x2
            annotations['bboxes'][idx, 3] = y2
            annotations['labels'][idx] = cls_id

        return annotations
# Pascal VOC class name -> label id.
voc_classes = {
    'aeroplane'   : 0,
    'bicycle'     : 1,
    'bird'        : 2,
    'boat'        : 3,
    'bottle'      : 4,
    'bus'         : 5,
    'car'         : 6,
    'cat'         : 7,
    'chair'       : 8,
    'cow'         : 9,
    'diningtable' : 10,
    'dog'         : 11,
    'horse'       : 12,
    'motorbike'   : 13,
    'person'      : 14,
    'pottedplant' : 15,
    'sheep'       : 16,
    'sofa'        : 17,
    'train'       : 18,
    'tvmonitor'   : 19
}


def _findNode(parent, name, debug_name=None, parse=None):
    """ Find the child element `name` of `parent`, optionally parsing its text.

    Args
        parent     : XML element to search in.
        name       : Tag (or path) handed to `parent.find`.
        debug_name : Name used in error messages (defaults to `name`).
        parse      : Optional callable applied to the element text.

    Returns
        `parse(element.text)` when `parse` is given, otherwise the element itself.

    Raises
        ValueError: The element is missing, or `parse` rejected its text.
    """
    if debug_name is None:
        debug_name = name

    result = parent.find(name)
    if result is None:
        raise ValueError('missing element \'{}\''.format(debug_name))
    if parse is not None:
        try:
            return parse(result.text)
        except (TypeError, ValueError) as e:
            # An empty element has text None, which int()/float() reject with a
            # TypeError; report it like any other illegal value.
            raise_from(ValueError('illegal value for \'{}\': {}'.format(debug_name, e)), None)
    return result
""" self.data_dir = data_dir self.set_name = set_name self.classes = classes self.image_names = [line.strip().split(None, 1)[0] for line in open(os.path.join(data_dir, 'ImageSets', 'Main', set_name + '.txt')).readlines()] self.image_extension = image_extension self.skip_truncated = skip_truncated self.skip_difficult = skip_difficult self.labels = {} for key, value in self.classes.items(): self.labels[value] = key super(PascalVocGenerator, self).__init__(**kwargs) def size(self): """ Size of the dataset. """ return len(self.image_names) def num_classes(self): """ Number of classes in the dataset. """ return len(self.classes) def has_label(self, label): """ Return True if label is a known label. """ return label in self.labels def has_name(self, name): """ Returns True if name is a known class. """ return name in self.classes def name_to_label(self, name): """ Map name to label. """ return self.classes[name] def label_to_name(self, label): """ Map label to name. """ return self.labels[label] def image_aspect_ratio(self, image_index): """ Compute the aspect ratio for an image with image_index. """ path = os.path.join(self.data_dir, 'JPEGImages', self.image_names[image_index] + self.image_extension) image = Image.open(path) return float(image.width) / float(image.height) def image_path(self, image_index): """ Get the path to an image. """ return os.path.join(self.data_dir, 'JPEGImages', self.image_names[image_index] + self.image_extension) def load_image(self, image_index): """ Load an image at the image_index. """ return read_image_bgr(self.image_path(image_index)) def __parse_annotation(self, element): """ Parse an annotation given an XML element. 
""" truncated = _findNode(element, 'truncated', parse=int) difficult = _findNode(element, 'difficult', parse=int) class_name = _findNode(element, 'name').text if class_name not in self.classes: raise ValueError('class name \'{}\' not found in classes: {}'.format(class_name, list(self.classes.keys()))) box = np.zeros((4,)) label = self.name_to_label(class_name) bndbox = _findNode(element, 'bndbox') box[0] = _findNode(bndbox, 'xmin', 'bndbox.xmin', parse=float) - 1 box[1] = _findNode(bndbox, 'ymin', 'bndbox.ymin', parse=float) - 1 box[2] = _findNode(bndbox, 'xmax', 'bndbox.xmax', parse=float) - 1 box[3] = _findNode(bndbox, 'ymax', 'bndbox.ymax', parse=float) - 1 return truncated, difficult, box, label def __parse_annotations(self, xml_root): """ Parse all annotations under the xml_root. """ annotations = {'labels': np.empty((len(xml_root.findall('object')),)), 'bboxes': np.empty((len(xml_root.findall('object')), 4))} for i, element in enumerate(xml_root.iter('object')): try: truncated, difficult, box, label = self.__parse_annotation(element) except ValueError as e: raise_from(ValueError('could not parse object #{}: {}'.format(i, e)), None) if truncated and self.skip_truncated: continue if difficult and self.skip_difficult: continue annotations['bboxes'][i, :] = box annotations['labels'][i] = label return annotations def load_annotations(self, image_index): """ Load annotations for an image_index. 
""" filename = self.image_names[image_index] + '.xml' try: tree = ET.parse(os.path.join(self.data_dir, 'Annotations', filename)) return self.__parse_annotations(tree.getroot()) except ET.ParseError as e: raise_from(ValueError('invalid annotations file: {}: {}'.format(filename, e)), None) except ValueError as e: raise_from(ValueError('invalid annotations file: {}: {}'.format(filename, e)), None) ================================================ FILE: imageai_tf_deprecated/Detection/keras_retinanet/utils/__init__.py ================================================ ================================================ FILE: imageai_tf_deprecated/Detection/keras_retinanet/utils/anchors.py ================================================ """ Copyright 2017-2018 Fizyr (https://fizyr.com) Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. """ import numpy as np from tensorflow import keras #from ..utils.compute_overlap import compute_overlap class AnchorParameters: """ The parameteres that define how anchors are generated. Args sizes : List of sizes to use. Each size corresponds to one feature level. strides : List of strides to use. Each stride correspond to one feature level. ratios : List of ratios to use per location in a feature map. scales : List of scales to use per location in a feature map. 
""" def __init__(self, sizes, strides, ratios, scales): self.sizes = sizes self.strides = strides self.ratios = ratios self.scales = scales def num_anchors(self): return len(self.ratios) * len(self.scales) """ The default anchor parameters. """ AnchorParameters.default = AnchorParameters( sizes = [32, 64, 128, 256, 512], strides = [8, 16, 32, 64, 128], ratios = np.array([0.5, 1, 2], keras.backend.floatx()), scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)], keras.backend.floatx()), ) def anchor_targets_bbox( anchors, image_group, annotations_group, num_classes, negative_overlap=0.4, positive_overlap=0.5 ): """ Generate anchor targets for bbox detection. Args anchors: np.array of annotations of shape (N, 4) for (x1, y1, x2, y2). image_group: List of BGR images. annotations_group: List of annotation dictionaries with each annotation containing 'labels' and 'bboxes' of an image. num_classes: Number of classes to predict. mask_shape: If the image is padded with zeros, mask_shape can be used to mark the relevant part of the image. negative_overlap: IoU overlap for negative anchors (all anchors with overlap < negative_overlap are negative). positive_overlap: IoU overlap or positive anchors (all anchors with overlap > positive_overlap are positive). Returns labels_batch: batch that contains labels & anchor states (np.array of shape (batch_size, N, num_classes + 1), where N is the number of anchors for an image and the last column defines the anchor state (-1 for ignore, 0 for bg, 1 for fg). regression_batch: batch that contains bounding-box regression targets for an image & anchor states (np.array of shape (batch_size, N, 4 + 1), where N is the number of anchors for an image, the first 4 columns define regression targets for (x1, y1, x2, y2) and the last column defines anchor states (-1 for ignore, 0 for bg, 1 for fg). """ assert(len(image_group) == len(annotations_group)), "The length of the images and annotations need to be equal." 
assert(len(annotations_group) > 0), "No data received to compute anchor targets for." for annotations in annotations_group: assert('bboxes' in annotations), "Annotations should contain bboxes." assert('labels' in annotations), "Annotations should contain labels." batch_size = len(image_group) regression_batch = np.zeros((batch_size, anchors.shape[0], 4 + 1), dtype=keras.backend.floatx()) labels_batch = np.zeros((batch_size, anchors.shape[0], num_classes + 1), dtype=keras.backend.floatx()) # compute labels and regression targets for index, (image, annotations) in enumerate(zip(image_group, annotations_group)): if annotations['bboxes'].shape[0]: # obtain indices of gt annotations with the greatest overlap positive_indices, ignore_indices, argmax_overlaps_inds = compute_gt_annotations(anchors, annotations['bboxes'], negative_overlap, positive_overlap) labels_batch[index, ignore_indices, -1] = -1 labels_batch[index, positive_indices, -1] = 1 regression_batch[index, ignore_indices, -1] = -1 regression_batch[index, positive_indices, -1] = 1 # compute target class labels labels_batch[index, positive_indices, annotations['labels'][argmax_overlaps_inds[positive_indices]].astype(int)] = 1 regression_batch[index, :, :-1] = bbox_transform(anchors, annotations['bboxes'][argmax_overlaps_inds, :]) # ignore annotations outside of image if image.shape: anchors_centers = np.vstack([(anchors[:, 0] + anchors[:, 2]) / 2, (anchors[:, 1] + anchors[:, 3]) / 2]).T indices = np.logical_or(anchors_centers[:, 0] >= image.shape[1], anchors_centers[:, 1] >= image.shape[0]) labels_batch[index, indices, -1] = -1 regression_batch[index, indices, -1] = -1 return regression_batch, labels_batch def layer_shapes(image_shape, model): """Compute layer shapes given input image shape and the model. Args image_shape: The shape of the image. model: The model to use for computing how the image shape is transformed in the pyramid. Returns A dictionary mapping layer names to image shapes. 
""" shape = { model.layers[0].name: (None,) + image_shape, } for layer in model.layers[1:]: nodes = layer._inbound_nodes for node in nodes: if isinstance(node.inbound_layers, keras.layers.Layer): inputs = [shape[node.inbound_layers.name]] else: inputs = [shape[lr.name] for lr in node.inbound_layers] if not inputs: continue shape[layer.name] = layer.compute_output_shape(inputs[0] if len(inputs) == 1 else inputs) return shape def make_shapes_callback(model): """ Make a function for getting the shape of the pyramid levels. """ def get_shapes(image_shape, pyramid_levels): shape = layer_shapes(image_shape, model) image_shapes = [shape["P{}".format(level)][1:3] for level in pyramid_levels] return image_shapes return get_shapes def guess_shapes(image_shape, pyramid_levels): """Guess shapes based on pyramid levels. Args image_shape: The shape of the image. pyramid_levels: A list of what pyramid levels are used. Returns A list of image shapes at each pyramid level. """ image_shape = np.array(image_shape[:2]) image_shapes = [(image_shape + 2 ** x - 1) // (2 ** x) for x in pyramid_levels] return image_shapes def anchors_for_shape( image_shape, pyramid_levels=None, anchor_params=None, shapes_callback=None, ): """ Generators anchors for a given shape. Args image_shape: The shape of the image. pyramid_levels: List of ints representing which pyramids to use (defaults to [3, 4, 5, 6, 7]). anchor_params: Struct containing anchor parameters. If None, default values are used. shapes_callback: Function to call for getting the shape of the image at different pyramid levels. Returns np.array of shape (N, 4) containing the (x1, y1, x2, y2) coordinates for the anchors. 
""" if pyramid_levels is None: pyramid_levels = [3, 4, 5, 6, 7] if anchor_params is None: anchor_params = AnchorParameters.default if shapes_callback is None: shapes_callback = guess_shapes image_shapes = shapes_callback(image_shape, pyramid_levels) # compute anchors over all pyramid levels all_anchors = np.zeros((0, 4)) for idx, p in enumerate(pyramid_levels): anchors = generate_anchors( base_size=anchor_params.sizes[idx], ratios=anchor_params.ratios, scales=anchor_params.scales ) shifted_anchors = shift(image_shapes[idx], anchor_params.strides[idx], anchors) all_anchors = np.append(all_anchors, shifted_anchors, axis=0) return all_anchors def shift(shape, stride, anchors): """ Produce shifted anchors based on shape of the map and stride size. Args shape : Shape to shift the anchors over. stride : Stride to shift the anchors with over the shape. anchors: The anchors to apply at each location. """ # create a grid starting from half stride from the top left corner shift_x = (np.arange(0, shape[1]) + 0.5) * stride shift_y = (np.arange(0, shape[0]) + 0.5) * stride shift_x, shift_y = np.meshgrid(shift_x, shift_y) shifts = np.vstack(( shift_x.ravel(), shift_y.ravel(), shift_x.ravel(), shift_y.ravel() )).transpose() # add A anchors (1, A, 4) to # cell K shifts (K, 1, 4) to get # shift anchors (K, A, 4) # reshape to (K*A, 4) shifted anchors A = anchors.shape[0] K = shifts.shape[0] all_anchors = (anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 4)).transpose((1, 0, 2))) all_anchors = all_anchors.reshape((K * A, 4)) return all_anchors def generate_anchors(base_size=16, ratios=None, scales=None): """ Generate anchor (reference) windows by enumerating aspect ratios X scales w.r.t. a reference window. 
""" if ratios is None: ratios = AnchorParameters.default.ratios if scales is None: scales = AnchorParameters.default.scales num_anchors = len(ratios) * len(scales) # initialize output anchors anchors = np.zeros((num_anchors, 4)) # scale base_size anchors[:, 2:] = base_size * np.tile(scales, (2, len(ratios))).T # compute areas of anchors areas = anchors[:, 2] * anchors[:, 3] # correct for ratios anchors[:, 2] = np.sqrt(areas / np.repeat(ratios, len(scales))) anchors[:, 3] = anchors[:, 2] * np.repeat(ratios, len(scales)) # transform from (x_ctr, y_ctr, w, h) -> (x1, y1, x2, y2) anchors[:, 0::2] -= np.tile(anchors[:, 2] * 0.5, (2, 1)).T anchors[:, 1::2] -= np.tile(anchors[:, 3] * 0.5, (2, 1)).T return anchors def bbox_transform(anchors, gt_boxes, mean=None, std=None): """Compute bounding-box regression targets for an image.""" # The Mean and std are calculated from COCO dataset. # Bounding box normalization was firstly introduced in the Fast R-CNN paper. # See https://github.com/fizyr/keras-retinanet/issues/1273#issuecomment-585828825 for more details if mean is None: mean = np.array([0, 0, 0, 0]) if std is None: std = np.array([0.2, 0.2, 0.2, 0.2]) if isinstance(mean, (list, tuple)): mean = np.array(mean) elif not isinstance(mean, np.ndarray): raise ValueError('Expected mean to be a np.ndarray, list or tuple. Received: {}'.format(type(mean))) if isinstance(std, (list, tuple)): std = np.array(std) elif not isinstance(std, np.ndarray): raise ValueError('Expected std to be a np.ndarray, list or tuple. Received: {}'.format(type(std))) anchor_widths = anchors[:, 2] - anchors[:, 0] anchor_heights = anchors[:, 3] - anchors[:, 1] # According to the information provided by a keras-retinanet author, they got marginally better results using # the following way of bounding box parametrization. 
def evaluate_coco(generator, model, threshold=0.05):
    """ Use the pycocotools to evaluate a COCO model on a dataset.

    Two files are written to the current working directory:
    '<set_name>_bbox_results.json' and '<set_name>_processed_image_ids.json'.

    Args
        generator : The generator for generating the evaluation data.
        model     : The model to evaluate.
        threshold : The score threshold to use.

    Returns
        The COCOeval stats, or None when no detection reached the threshold
        (in which case nothing is written).
    """
    # start collecting results
    results = []
    image_ids = []
    for index in progressbar.progressbar(range(generator.size()), prefix='COCO evaluation: '):
        image = generator.load_image(index)
        image = generator.preprocess_image(image)
        image, scale = generator.resize_image(image)

        if keras.backend.image_data_format() == 'channels_first':
            image = image.transpose((2, 0, 1))

        # run network
        boxes, scores, labels = model.predict_on_batch(np.expand_dims(image, axis=0))

        # correct boxes for image scale
        boxes /= scale

        # change to (x, y, w, h) (MS COCO standard)
        boxes[:, :, 2] -= boxes[:, :, 0]
        boxes[:, :, 3] -= boxes[:, :, 1]

        # compute predicted labels and scores
        for box, score, label in zip(boxes[0], scores[0], labels[0]):
            # assumes the model returns detections sorted by descending score,
            # so everything after the first low score can be dropped
            if score < threshold:
                break

            # append detection to results
            results.append({
                'image_id'    : generator.image_ids[index],
                'category_id' : generator.label_to_coco_label(label),
                'score'       : float(score),
                'bbox'        : box.tolist(),
            })

        # append image to list of processed images
        image_ids.append(generator.image_ids[index])

    if not results:
        return None

    results_path = '{}_bbox_results.json'.format(generator.set_name)
    image_ids_path = '{}_processed_image_ids.json'.format(generator.set_name)

    # write output; the files are closed (and therefore flushed) before the
    # results file is read back by loadRes below
    with open(results_path, 'w') as results_file:
        json.dump(results, results_file, indent=4)
    with open(image_ids_path, 'w') as image_ids_file:
        json.dump(image_ids, image_ids_file, indent=4)

    # load results in COCO evaluation tool
    coco_true = generator.coco
    coco_pred = coco_true.loadRes(results_path)

    # run COCO evaluation
    coco_eval = COCOeval(coco_true, coco_pred, 'bbox')
    coco_eval.params.imgIds = image_ids
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()
    return coco_eval.stats
Contains 80 colors in total. Args label: The label to get the color for. Returns A list of three values representing a RGB color. If no color is defined for a certain label, the color green is returned and a warning is printed. """ if label < len(colors): return colors[label] else: warnings.warn('Label {} has no color, returning default.'.format(label)) return (0, 255, 0) """ Generated using: ``` colors = [list((matplotlib.colors.hsv_to_rgb([x, 1.0, 1.0]) * 255).astype(int)) for x in np.arange(0, 1, 1.0 / 80)] shuffle(colors) pprint(colors) ``` """ colors = [ [31 , 0 , 255] , [0 , 159 , 255] , [255 , 95 , 0] , [255 , 19 , 0] , [255 , 0 , 0] , [255 , 38 , 0] , [0 , 255 , 25] , [255 , 0 , 133] , [255 , 172 , 0] , [108 , 0 , 255] , [0 , 82 , 255] , [0 , 255 , 6] , [255 , 0 , 152] , [223 , 0 , 255] , [12 , 0 , 255] , [0 , 255 , 178] , [108 , 255 , 0] , [184 , 0 , 255] , [255 , 0 , 76] , [146 , 255 , 0] , [51 , 0 , 255] , [0 , 197 , 255] , [255 , 248 , 0] , [255 , 0 , 19] , [255 , 0 , 38] , [89 , 255 , 0] , [127 , 255 , 0] , [255 , 153 , 0] , [0 , 255 , 255] , [0 , 255 , 216] , [0 , 255 , 121] , [255 , 0 , 248] , [70 , 0 , 255] , [0 , 255 , 159] , [0 , 216 , 255] , [0 , 6 , 255] , [0 , 63 , 255] , [31 , 255 , 0] , [255 , 57 , 0] , [255 , 0 , 210] , [0 , 255 , 102] , [242 , 255 , 0] , [255 , 191 , 0] , [0 , 255 , 63] , [255 , 0 , 95] , [146 , 0 , 255] , [184 , 255 , 0] , [255 , 114 , 0] , [0 , 255 , 235] , [255 , 229 , 0] , [0 , 178 , 255] , [255 , 0 , 114] , [255 , 0 , 57] , [0 , 140 , 255] , [0 , 121 , 255] , [12 , 255 , 0] , [255 , 210 , 0] , [0 , 255 , 44] , [165 , 255 , 0] , [0 , 25 , 255] , [0 , 255 , 140] , [0 , 101 , 255] , [0 , 255 , 82] , [223 , 255 , 0] , [242 , 0 , 255] , [89 , 0 , 255] , [165 , 0 , 255] , [70 , 255 , 0] , [255 , 0 , 172] , [255 , 76 , 0] , [203 , 255 , 0] , [204 , 0 , 255] , [255 , 0 , 229] , [255 , 133 , 0] , [127 , 0 , 255] , [0 , 235 , 255] , [0 , 255 , 197] , [255 , 0 , 191] , [0 , 44 , 255] , [50 , 255 , 0] ] 
def compute_overlap(
    np.ndarray[double, ndim=2] boxes,
    np.ndarray[double, ndim=2] query_boxes
):
    """
    Compute the pairwise intersection-over-union between two sets of boxes.

    Columns 0/2 are used as the min/max extent along one axis and columns 1/3
    as the min/max extent along the other.

    Args
        boxes: (N, 4) ndarray of float
        query_boxes: (K, 4) ndarray of float

    Returns
        overlaps: (N, K) ndarray of overlap between boxes and query_boxes;
                  entries stay 0 for pairs without a positive intersection
    """
    cdef unsigned int N = boxes.shape[0]
    cdef unsigned int K = query_boxes.shape[0]
    cdef np.ndarray[double, ndim=2] overlaps = np.zeros((N, K), dtype=np.float64)
    cdef double iw, ih, box_area
    cdef double ua
    cdef unsigned int k, n
    for k in range(K):
        # area of the query box, computed once per query box
        box_area = (
            (query_boxes[k, 2] - query_boxes[k, 0]) *
            (query_boxes[k, 3] - query_boxes[k, 1])
        )
        for n in range(N):
            # extent of the intersection along the first axis
            iw = (
                min(boxes[n, 2], query_boxes[k, 2]) -
                max(boxes[n, 0], query_boxes[k, 0])
            )
            if iw > 0:
                # extent of the intersection along the second axis
                ih = (
                    min(boxes[n, 3], query_boxes[k, 3]) -
                    max(boxes[n, 1], query_boxes[k, 1])
                )
                if ih > 0:
                    # union area = area(box) + area(query box) - intersection
                    ua = np.float64(
                        (boxes[n, 2] - boxes[n, 0]) *
                        (boxes[n, 3] - boxes[n, 1]) +
                        box_area - iw * ih
                    )
                    overlaps[n, k] = iw * ih / ua
    return overlaps
def parse_pyramid_levels(config):
    """ Read the pyramid levels from a parsed config.

    Args
        config: Mapping with a 'pyramid_levels' section whose 'levels' entry is a
                whitespace-separated list of integers, e.g. '3 4 5 6 7'.

    Returns
        The levels as a list of ints.

    Raises
        ValueError: if 'levels' is empty or contains a non-integer token.
    """
    # split() without a separator tolerates repeated, leading and trailing
    # whitespace (and tabs), which split(' ') turned into empty tokens.
    tokens = config['pyramid_levels']['levels'].split()
    if not tokens:
        raise ValueError('pyramid_levels.levels must contain at least one level')
    return [int(token) for token in tokens]
def _compute_ap(recall, precision):
    """ Compute the average precision, given the recall and precision curves.

    Code originally from https://github.com/rbgirshick/py-faster-rcnn.

    # Arguments
        recall:    The recall curve (list).
        precision: The precision curve (list).
    # Returns
        The area under the monotonically decreasing precision envelope,
        summed over the points where recall changes.
    """
    # pad both curves with sentinel values at either end
    padded_recall = np.concatenate(([0.], recall, [1.]))
    padded_precision = np.concatenate(([0.], precision, [0.]))

    # precision envelope: every entry becomes the maximum of itself and all
    # entries to its right (a running maximum taken from the end)
    envelope = np.maximum.accumulate(padded_precision[::-1])[::-1]

    # positions where the recall value changes
    steps = np.where(padded_recall[1:] != padded_recall[:-1])[0]

    # sum (\Delta recall) * prec
    return np.sum((padded_recall[steps + 1] - padded_recall[steps]) * envelope[steps + 1])
The result is a list of lists such that the size is: all_detections[num_images][num_classes] = detections[num_detections, 4 + num_classes] # Arguments generator : The generator used to run images through the model. model : The model to run on the images. score_threshold : The score confidence threshold to use. max_detections : The maximum number of detections to use per image. save_path : The path to save the images with visualized detections to. # Returns A list of lists containing the detections for each image in the generator. """ all_detections = [[None for i in range(generator.num_classes()) if generator.has_label(i)] for j in range(generator.size())] all_inferences = [None for i in range(generator.size())] for i in progressbar.progressbar(range(generator.size()), prefix='Running network: '): raw_image = generator.load_image(i) image, scale = generator.resize_image(raw_image.copy()) image = generator.preprocess_image(image) if keras.backend.image_data_format() == 'channels_first': image = image.transpose((2, 0, 1)) # run network start = time.time() boxes, scores, labels = model.predict_on_batch(np.expand_dims(image, axis=0))[:3] inference_time = time.time() - start # correct boxes for image scale boxes /= scale # select indices which have a score above the threshold indices = np.where(scores[0, :] > score_threshold)[0] # select those scores scores = scores[0][indices] # find the order with which to sort the scores scores_sort = np.argsort(-scores)[:max_detections] # select detections image_boxes = boxes[0, indices[scores_sort], :] image_scores = scores[scores_sort] image_labels = labels[0, indices[scores_sort]] image_detections = np.concatenate([image_boxes, np.expand_dims(image_scores, axis=1), np.expand_dims(image_labels, axis=1)], axis=1) if save_path is not None: draw_annotations(raw_image, generator.load_annotations(i), label_to_name=generator.label_to_name) draw_detections(raw_image, image_boxes, image_scores, image_labels, 
def evaluate(
    generator,
    model,
    iou_threshold=0.5,
    score_threshold=0.05,
    max_detections=100,
    save_path=None
):
    """ Evaluate a given dataset using a given model.

    # Arguments
        generator       : The generator that represents the dataset to evaluate.
        model           : The model to evaluate.
        iou_threshold   : The threshold used to consider when a detection is positive or negative.
        score_threshold : The score confidence threshold to use for detections.
        max_detections  : The maximum number of detections to use per image.
        save_path       : The path to save images with visualized detections to.
    # Returns
        A tuple (average_precisions, inference_time):
        average_precisions maps each known label to (average precision, number of annotations),
        or to (0, 0) when the label has no annotations;
        inference_time is the summed per-image inference time divided by the dataset size.
    """
    # gather all detections and annotations
    all_detections, all_inferences = _get_detections(generator, model, score_threshold=score_threshold, max_detections=max_detections, save_path=save_path)
    all_annotations = _get_annotations(generator)
    average_precisions = {}

    # process detections and annotations
    for label in range(generator.num_classes()):
        if not generator.has_label(label):
            continue

        # Collected in plain lists and converted once per label; growing numpy
        # arrays with np.append copies the whole array for every detection.
        false_positives = []
        true_positives = []
        scores = []
        num_annotations = 0.0

        for i in range(generator.size()):
            detections = all_detections[i][label]
            annotations = all_annotations[i][label]
            num_annotations += annotations.shape[0]
            # indices of annotations already claimed by an earlier detection
            detected_annotations = set()

            for d in detections:
                scores.append(d[4])

                if annotations.shape[0] == 0:
                    false_positives.append(1)
                    true_positives.append(0)
                    continue

                overlaps = compute_overlap(np.expand_dims(d, axis=0), annotations)
                assigned_annotation = int(np.argmax(overlaps, axis=1)[0])
                max_overlap = overlaps[0, assigned_annotation]

                # each annotation can be matched by at most one detection
                if max_overlap >= iou_threshold and assigned_annotation not in detected_annotations:
                    false_positives.append(0)
                    true_positives.append(1)
                    detected_annotations.add(assigned_annotation)
                else:
                    false_positives.append(1)
                    true_positives.append(0)

        # no annotations -> AP for this class is 0 (is this correct?)
        if num_annotations == 0:
            average_precisions[label] = 0, 0
            continue

        # sort by score
        scores = np.array(scores, dtype=np.float64)
        indices = np.argsort(-scores)
        false_positives = np.array(false_positives, dtype=np.float64)[indices]
        true_positives = np.array(true_positives, dtype=np.float64)[indices]

        # compute false positives and true positives
        false_positives = np.cumsum(false_positives)
        true_positives = np.cumsum(true_positives)

        # compute recall and precision
        recall = true_positives / num_annotations
        # eps guards against a zero denominator
        precision = true_positives / np.maximum(true_positives + false_positives, np.finfo(np.float64).eps)

        # compute average precision
        average_precision = _compute_ap(recall, precision)
        average_precisions[label] = average_precision, num_annotations

    # inference time
    inference_time = np.sum(all_inferences) / generator.size()

    return average_precisions, inference_time
def read_image_bgr(path):
    """ Read an image in BGR format.

    Args
        path: Path to the image.

    Returns
        The image as an array whose last axis has been reversed from RGB to BGR order.
    """
    # We deliberately don't use cv2.imread here, since it gives no feedback on errors while reading the image.
    # The context manager closes the underlying file as soon as the pixels have been copied out.
    with Image.open(path) as handle:
        rgb = np.ascontiguousarray(handle.convert('RGB'))
    return rgb[:, :, ::-1]


def preprocess_image(x, mode='caffe'):
    """ Preprocess an image by subtracting the ImageNet mean.

    Args
        x: np.array of shape (None, None, 3) or (3, None, None).
        mode: One of "caffe" or "tf".
            - caffe: will zero-center each color channel with
                respect to the ImageNet dataset, without scaling.
            - tf: will scale pixels between -1 and 1, sample-wise.
            Any other value only converts the input to float32.

    Returns
        A float32 copy of the input, preprocessed according to `mode`.
    """
    # mostly identical to "https://github.com/keras-team/keras-applications/blob/master/keras_applications/imagenet_utils.py"
    # except for converting RGB -> BGR since we assume BGR already

    # always convert to float32 to keep compatibility with opencv; astype also copies,
    # so the caller's array is left untouched by the in-place arithmetic below
    converted = x.astype(np.float32)

    if mode == 'tf':
        converted /= 127.5
        converted -= 1.
    elif mode == 'caffe':
        converted -= [103.939, 116.779, 123.68]

    return converted


def adjust_transform_for_image(transform, image, relative_translation):
    """ Adjust a transformation for a specific image.

    The translation of the matrix will be scaled with the size of the image.
    The linear part of the transformation will be adjusted so that the origin of the
    transformation will be at the center of the image.

    Args
        transform: The 3 by 3 homogeneous transformation matrix. It is not modified.
        image: The image the transformation will be applied to; its shape must be (height, width, channels).
        relative_translation: If true, the translation column is multiplied by (width, height).

    Returns
        A new transformation matrix adjusted for the image.
    """
    height, width, channels = image.shape

    # Work on a copy: scaling the translation in place would silently alter the caller's matrix.
    result = np.array(transform, dtype=np.float64)

    # Scale the translation with the image size if specified.
    if relative_translation:
        result[0:2, 2] *= [width, height]

    # Move the origin of transformation.
    return change_transform_origin(result, (0.5 * width, 0.5 * height))


class TransformParameters:
    """ Struct holding parameters determining how to apply a transformation to an image.

    Args
        fill_mode:             One of: 'constant', 'nearest', 'reflect', 'wrap'
        interpolation:         One of: 'nearest', 'linear', 'cubic', 'area', 'lanczos4'
        cval:                  Fill value to use with fill_mode='constant'
        relative_translation:  If true (the default), interpret translation as a factor of the image size.
                               If false, interpret it as absolute pixels.
    """
    def __init__(
        self,
        fill_mode            = 'nearest',
        interpolation        = 'linear',
        cval                 = 0,
        relative_translation = True,
    ):
        self.fill_mode            = fill_mode
        self.cval                 = cval
        self.interpolation        = interpolation
        self.relative_translation = relative_translation

    def cvBorderMode(self):
        """ Return the OpenCV border constant for `fill_mode`, or None for an unknown name. """
        border_modes = {
            'constant': cv2.BORDER_CONSTANT,
            'nearest':  cv2.BORDER_REPLICATE,
            'reflect':  cv2.BORDER_REFLECT_101,
            'wrap':     cv2.BORDER_WRAP,
        }
        return border_modes.get(self.fill_mode)

    def cvInterpolation(self):
        """ Return the OpenCV interpolation constant for `interpolation`, or None for an unknown name. """
        interpolations = {
            'nearest':  cv2.INTER_NEAREST,
            'linear':   cv2.INTER_LINEAR,
            'cubic':    cv2.INTER_CUBIC,
            'area':     cv2.INTER_AREA,
            'lanczos4': cv2.INTER_LANCZOS4,
        }
        return interpolations.get(self.interpolation)


def apply_transform(matrix, image, params):
    """ Apply a transformation to an image.

    The origin of transformation is at the top left corner of the image.

    The matrix is interpreted such that a point (x, y) on the original image is moved to transform * (x, y) in the generated image.
    Mathematically speaking, that means that the matrix is a transformation from the transformed image space to the original image space.

    Args
      matrix: A homogeneous 3 by 3 matrix representing the transformation to apply.
      image:  The image to transform.
      params: The transform parameters (see TransformParameters)

    Returns
      The warped image, with the same width and height as the input.
    """
    height, width = image.shape[0], image.shape[1]
    # Only the top two rows of the homogeneous matrix are passed on to warpAffine.
    affine = matrix[:2, :]
    return cv2.warpAffine(
        image,
        affine,
        dsize       = (width, height),
        flags       = params.cvInterpolation(),
        borderMode  = params.cvBorderMode(),
        borderValue = params.cval,
    )


def compute_resize_scale(image_shape, min_side=800, max_side=1333):
    """ Compute an image scale such that the image size is constrained to min_side and max_side.

    Args
        image_shape: Shape of the image; the first two entries are taken as (rows, cols),
            so both (rows, cols, channels) and plain (rows, cols) shapes are accepted.
        min_side: The image's min side will be equal to min_side after resizing.
        max_side: If after resizing the image's max side is above max_side, resize until the max side is equal to max_side.

    Returns
        A resizing scale.
    """
    rows, cols = image_shape[:2]
    shorter, longer = min(rows, cols), max(rows, cols)

    # rescale the image so the smallest side is min_side
    scale = min_side / shorter

    # check if the largest side is now greater than max_side, which can happen
    # when images have a large aspect ratio
    if longer * scale > max_side:
        scale = max_side / longer

    return scale
def resize_image(img, min_side=800, max_side=1333):
    """ Resize an image so that its sides respect min_side and max_side.

    Args
        img: The image to resize.
        min_side: The image's min side will be equal to min_side after resizing.
        max_side: If after resizing the image's max side is above max_side, resize until the max side is equal to max_side.

    Returns
        A tuple (resized image, scale that was applied).
    """
    # the scale comes from compute_resize_scale and is applied uniformly to both axes
    factor = compute_resize_scale(img.shape, min_side=min_side, max_side=max_side)
    resized = cv2.resize(img, None, fx=factor, fy=factor)
    return resized, factor


def _uniform(val_range):
    """ Draw one sample uniformly from the given range.

    Args
        val_range: A pair of lower and upper bound.
    """
    low, high = val_range[0], val_range[1]
    return np.random.uniform(low, high)


def _check_range(val_range, min_val=None, max_val=None):
    """ Validate a (lower, upper) pair, raising ValueError when it is not acceptable.

    Args
        val_range: A pair of lower and upper bound.
        min_val: Minimal value for the lower bound (not checked when None).
        max_val: Maximal value for the upper bound (not checked when None).
    """
    lower, upper = val_range[0], val_range[1]
    if lower > upper:
        raise ValueError('interval lower bound > upper bound')
    if min_val is not None and lower < min_val:
        raise ValueError('invalid interval lower bound')
    if max_val is not None and upper > max_val:
        raise ValueError('invalid interval upper bound')


def _clip(image):
    """ Limit the values of an image to [0, 255] and convert it to np.uint8.

    Args
        image: Image to clip.
    """
    bounded = np.clip(image, 0, 255)
    return bounded.astype(np.uint8)


class VisualEffect:
    """ Struct holding parameters and applying image color transformation.

    Args
        contrast_factor:   A factor for adjusting contrast. Should be between 0 and 3.
        brightness_delta:  Brightness offset between -1 and 1 added to the pixel values.
        hue_delta:         Hue offset between -1 and 1 added to the hue channel.
        saturation_factor: A factor multiplying the saturation values of each pixel.
    """

    def __init__(
        self,
        contrast_factor,
        brightness_delta,
        hue_delta,
        saturation_factor,
    ):
        self.contrast_factor = contrast_factor
        self.brightness_delta = brightness_delta
        self.hue_delta = hue_delta
        self.saturation_factor = saturation_factor

    def __call__(self, image):
        """ Apply the configured visual effect to the image.

        Every adjustment whose parameter is falsy (0 or None) is skipped.

        Args
            image: Image to adjust
        """
        if self.contrast_factor:
            image = adjust_contrast(image, self.contrast_factor)
        if self.brightness_delta:
            image = adjust_brightness(image, self.brightness_delta)

        # Hue and saturation are edited in HSV space; skip the round trip when neither is requested.
        if not (self.hue_delta or self.saturation_factor):
            return image

        hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
        if self.hue_delta:
            hsv = adjust_hue(hsv, self.hue_delta)
        if self.saturation_factor:
            hsv = adjust_saturation(hsv, self.saturation_factor)
        return cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)
def random_visual_effect_generator(
    contrast_range=(0.9, 1.1),
    brightness_range=(-.1, .1),
    hue_range=(-0.05, 0.05),
    saturation_range=(0.95, 1.05)
):
    """ Generate visual effect parameters uniformly sampled from the given intervals.

    The intervals are validated immediately; the returned generator then yields an endless
    stream of VisualEffect objects.

    Args
        contrast_range:   A factor interval for adjusting contrast. Its lower bound must not be below 0.
        brightness_range: An interval between -1 and 1 for the amount added to the pixels.
        hue_range:        An interval between -1 and 1 for the amount added to the hue channel.
        saturation_range: An interval for the factor multiplying the saturation values of each pixel.
                          Its lower bound must not be below 0.
    """
    constraints = (
        (contrast_range, (0,)),
        (brightness_range, (-1, 1)),
        (hue_range, (-1, 1)),
        (saturation_range, (0,)),
    )
    for interval, bounds in constraints:
        _check_range(interval, *bounds)

    def _effects():
        while True:
            yield VisualEffect(
                contrast_factor=_uniform(contrast_range),
                brightness_delta=_uniform(brightness_range),
                hue_delta=_uniform(hue_range),
                saturation_factor=_uniform(saturation_range),
            )

    return _effects()


def adjust_contrast(image, factor):
    """ Adjust contrast of an image.

    Args
        image: Image to adjust.
        factor: A factor for adjusting contrast.
    """
    # averaging over the first two axes leaves one mean per entry of the remaining axis
    channel_mean = image.mean(axis=0).mean(axis=0)
    centered = image - channel_mean
    return _clip(centered * factor + channel_mean)


def adjust_brightness(image, delta):
    """ Adjust brightness of an image.

    Args
        image: Image to adjust.
        delta: Brightness offset between -1 and 1 added to the pixel values.
    """
    offset = delta * 255
    return _clip(image + offset)


def adjust_hue(image, delta):
    """ Adjust hue of an image in place.

    Args
        image: Image to adjust; channel 0 is treated as the hue channel and is overwritten.
        delta: Offset between -1 and 1 for the amount added to the hue channel.
               The values are rotated if they exceed 180.

    Returns
        The same image object that was passed in.
    """
    hue = image[..., 0]
    image[..., 0] = np.mod(hue + delta * 180, 180)
    return image


def adjust_saturation(image, factor):
    """ Adjust saturation of an image in place.

    Args
        image: Image to adjust; channel 1 is treated as the saturation channel and is overwritten.
        factor: The factor multiplying the saturation values of each pixel.

    Returns
        The same image object that was passed in.
    """
    saturation = image[..., 1]
    image[..., 1] = np.clip(saturation * factor, 0, 255)
    return image
# ---- keras_retinanet/utils/model.py ----

def freeze(model):
    """ Set all layers in a model to non-trainable.

    The weights for these layers will not be updated during training.

    This function modifies the given model in-place,
    but it also returns the modified model to allow easy chaining with other functions.
    """
    for layer in model.layers:
        layer.trainable = False
    return model


# ---- keras_retinanet/utils/tf_version.py ----

MINIMUM_TF_VERSION = 2, 3, 0
BLACKLISTED_TF_VERSIONS = []


def tf_version():
    """ Get the Tensorflow version.

    Only the leading dotted-number part of the version string is used, so suffixes such as
    "-rc1", "rc1" or "+local" are ignored.

    Returns
        tuple of (major, minor, patch).

    Raises
        ValueError: if the version string does not start with a number.
    """
    import re

    release = re.match(r'\d+(?:\.\d+)*', tf.version.VERSION)
    if release is None:
        raise ValueError('Cannot parse tensorflow version {!r}.'.format(tf.version.VERSION))
    return tuple(int(part) for part in release.group(0).split('.'))


def tf_version_ok(minimum_tf_version=MINIMUM_TF_VERSION, blacklisted=BLACKLISTED_TF_VERSIONS):
    """ Check that the current Tensorflow version is at least the minimum version and not blacklisted.
    """
    current = tf_version()
    return current >= minimum_tf_version and current not in blacklisted


def assert_tf_version(minimum_tf_version=MINIMUM_TF_VERSION, blacklisted=BLACKLISTED_TF_VERSIONS):
    """ Assert that the Tensorflow version is up to date.

    Raises
        AssertionError: if the installed version is too old or blacklisted. The error is raised
            explicitly instead of through an `assert` statement, so the check is still performed
            when Python runs with -O.
    """
    detected = tf.version.VERSION
    required = '.'.join(map(str, minimum_tf_version))
    if not tf_version_ok(minimum_tf_version, blacklisted):
        raise AssertionError(
            'You are using tensorflow version {}. The minimum required version is {} (blacklisted: {}).'.format(detected, required, blacklisted)
        )


def check_tf_version():
    """ Check that the Tensorflow version is up to date. If it isn't, print an error message and exit the script.
    """
    try:
        assert_tf_version()
    except AssertionError as failure:
        print(failure, file=sys.stderr)
        sys.exit(1)


# ---- keras_retinanet/utils/transform.py ----

DEFAULT_PRNG = np.random


def colvec(*args):
    """ Create a numpy array representing a column vector from the given values. """
    return np.transpose(np.array([args]))


def transform_aabb(transform, aabb):
    """ Apply a transformation to an axis aligned bounding box.

    The result is a new AABB in the same coordinate system as the original AABB.
    The new AABB contains all corner points of the original AABB after applying the given transformation.

    Args
        transform: The transformation to apply.
        aabb:      The box as (x1, y1, x2, y2), i.e. minimum x, minimum y, maximum x, maximum y.

    Returns
        The new AABB as list [x1, y1, x2, y2]
    """
    x1, y1, x2, y2 = aabb

    # Transform all 4 corners of the AABB (one homogeneous point per column).
    corners = transform.dot([
        [x1, x2, x1, x2],
        [y1, y2, y2, y1],
        [1,  1,  1,  1 ],
    ])

    # Extract the min and max corners again.
    lows = corners.min(axis=1)
    highs = corners.max(axis=1)

    return [lows[0], lows[1], highs[0], highs[1]]
def _random_vector(min, max, prng=DEFAULT_PRNG):
    """ Construct a random vector between min and max.

    Args
        min: the minimum value for each component
        max: the maximum value for each component
        prng: the pseudo-random number generator to use.
    """
    lower = np.array(min)
    upper = np.array(max)
    # both bounds must be one-dimensional and of equal length
    assert lower.shape == upper.shape
    assert lower.ndim == 1
    return prng.uniform(lower, upper)


def rotation(angle):
    """ Construct a homogeneous 2D rotation matrix.

    Args
        angle: the angle in radians
    Returns
        the rotation matrix as 3 by 3 numpy array
    """
    cosine, sine = np.cos(angle), np.sin(angle)
    return np.array([
        [cosine, -sine,  0],
        [sine,   cosine, 0],
        [0,      0,      1]
    ])


def random_rotation(min, max, prng=DEFAULT_PRNG):
    """ Construct a random rotation with an angle drawn uniformly between min and max.

    Args
        min:  a scalar for the minimum angle in radians
        max:  a scalar for the maximum angle in radians
        prng: the pseudo-random number generator to use.
    Returns
        a homogeneous 3 by 3 rotation matrix
    """
    angle = prng.uniform(min, max)
    return rotation(angle)


def translation(translation):
    """ Construct a homogeneous 2D translation matrix.

    Args
        translation: the translation 2D vector
    Returns
        the translation matrix as 3 by 3 numpy array
    """
    shift_x, shift_y = translation[0], translation[1]
    return np.array([
        [1, 0, shift_x],
        [0, 1, shift_y],
        [0, 0, 1]
    ])


def random_translation(min, max, prng=DEFAULT_PRNG):
    """ Construct a random 2D translation between min and max.

    Args
        min:  a 2D vector with the minimum translation for each dimension
        max:  a 2D vector with the maximum translation for each dimension
        prng: the pseudo-random number generator to use.
    Returns
        a homogeneous 3 by 3 translation matrix
    """
    offset = _random_vector(min, max, prng)
    return translation(offset)


def shear(angle):
    """ Construct a homogeneous 2D shear matrix.

    Args
        angle: the shear angle in radians
    Returns
        the shear matrix as 3 by 3 numpy array
    """
    sine, cosine = np.sin(angle), np.cos(angle)
    return np.array([
        [1, -sine,  0],
        [0, cosine, 0],
        [0, 0,      1]
    ])
def random_shear(min, max, prng=DEFAULT_PRNG):
    """ Construct a random 2D shear matrix with a shear angle drawn uniformly between min and max.

    Args
        min:  the minimum shear angle in radians.
        max:  the maximum shear angle in radians.
        prng: the pseudo-random number generator to use.
    Returns
        a homogeneous 3 by 3 shear matrix
    """
    angle = prng.uniform(min, max)
    return shear(angle)


def scaling(factor):
    """ Construct a homogeneous 2D scaling matrix.

    Args
        factor: a 2D vector for X and Y scaling
    Returns
        the zoom matrix as 3 by 3 numpy array
    """
    scale_x, scale_y = factor[0], factor[1]
    return np.array([
        [scale_x, 0,       0],
        [0,       scale_y, 0],
        [0,       0,       1]
    ])


def random_scaling(min, max, prng=DEFAULT_PRNG):
    """ Construct a random 2D scale matrix with factors drawn uniformly between min and max.

    Args
        min:  a 2D vector containing the minimum scaling factor for X and Y.
        max:  a 2D vector containing the maximum scaling factor for X and Y.
        prng: the pseudo-random number generator to use.
    Returns
        a homogeneous 3 by 3 scaling matrix
    """
    factors = _random_vector(min, max, prng)
    return scaling(factors)


def random_flip(flip_x_chance, flip_y_chance, prng=DEFAULT_PRNG):
    """ Construct a transformation randomly containing X/Y flips (or not).

    Args
        flip_x_chance: The chance that the result will contain a flip along the X axis.
        flip_y_chance: The chance that the result will contain a flip along the Y axis.
        prng:          The pseudo-random number generator to use.
    Returns
        a homogeneous 3 by 3 transformation matrix
    """
    # the X decision is always drawn before the Y decision
    mirror_x = prng.uniform(0, 1) < flip_x_chance
    mirror_y = prng.uniform(0, 1) < flip_y_chance
    # 1 - 2 * bool gives 1 for False and -1 for True.
    signs = (1 - 2 * mirror_x, 1 - 2 * mirror_y)
    return scaling(signs)


def change_transform_origin(transform, center):
    """ Create a new transform representing the same transformation,
        only with the origin of the linear part changed.

    Args
        transform: the transformation matrix
        center: the new origin of the transformation
    Returns
        translate(center) * transform * translate(-center)
    """
    origin = np.array(center)
    move_back = translation(origin)
    move_to_origin = translation(-origin)
    return np.linalg.multi_dot([move_back, transform, move_to_origin])
def random_transform(
    min_rotation=0,
    max_rotation=0,
    min_translation=(0, 0),
    max_translation=(0, 0),
    min_shear=0,
    max_shear=0,
    min_scaling=(1, 1),
    max_scaling=(1, 1),
    flip_x_chance=0,
    flip_y_chance=0,
    prng=DEFAULT_PRNG
):
    """ Create a random transformation.

    The result is the matrix product of the following randomly drawn operations, in this order (from left to right):
      * rotation
      * translation
      * shear
      * scaling
      * flip x (if applied)
      * flip y (if applied)

    Note that by default, the data generators in `keras_retinanet.preprocessing.generators` interpret the translation
    as factor of the image size. So an X translation of 0.1 would translate the image by 10% of its width.
    Set `relative_translation` to `False` in the `TransformParameters` of a data generator to have it interpret
    the translation directly as pixel distances instead.

    Args
        min_rotation:    The minimum rotation in radians for the transform as scalar.
        max_rotation:    The maximum rotation in radians for the transform as scalar.
        min_translation: The minimum translation for the transform as (x, y) vector.
        max_translation: The maximum translation for the transform as (x, y) vector.
        min_shear:       The minimum shear angle for the transform in radians.
        max_shear:       The maximum shear angle for the transform in radians.
        min_scaling:     The minimum scaling for the transform as (x, y) vector.
        max_scaling:     The maximum scaling for the transform as (x, y) vector.
        flip_x_chance:   The chance (0 to 1) that a transform will contain a flip along X direction.
        flip_y_chance:   The chance (0 to 1) that a transform will contain a flip along Y direction.
        prng:            The pseudo-random number generator to use.

    Returns
        The combined transformation as a homogeneous 3 by 3 matrix.
    """
    # The list is built left to right, so the PRNG is consumed in exactly this order.
    components = [
        random_rotation(min_rotation, max_rotation, prng),
        random_translation(min_translation, max_translation, prng),
        random_shear(min_shear, max_shear, prng),
        random_scaling(min_scaling, max_scaling, prng),
        random_flip(flip_x_chance, flip_y_chance, prng),
    ]
    return np.linalg.multi_dot(components)


def random_transform_generator(prng=None, **kwargs):
    """ Create a random transform generator.

    Uses a dedicated, newly created, properly seeded PRNG by default instead of the global DEFAULT_PRNG.
    Each item produced is the result of `random_transform` called with that PRNG and the given
    keyword arguments; see `random_transform` for the order of operations and the meaning of
    the keyword arguments below.

    Args
        min_rotation:    The minimum rotation in radians for the transform as scalar.
        max_rotation:    The maximum rotation in radians for the transform as scalar.
        min_translation: The minimum translation for the transform as (x, y) vector.
        max_translation: The maximum translation for the transform as (x, y) vector.
        min_shear:       The minimum shear angle for the transform in radians.
        max_shear:       The maximum shear angle for the transform in radians.
        min_scaling:     The minimum scaling for the transform as (x, y) vector.
        max_scaling:     The maximum scaling for the transform as (x, y) vector.
        flip_x_chance:   The chance (0 to 1) that a transform will contain a flip along X direction.
        flip_y_chance:   The chance (0 to 1) that a transform will contain a flip along Y direction.
        prng:            The pseudo-random number generator to use.
    """
    # RandomState automatically seeds using the best available method.
    source = np.random.RandomState() if prng is None else prng

    while True:
        yield random_transform(prng=source, **kwargs)
def draw_box(image, box, color, thickness=2):
    """ Draws a box on an image with a given color.

    # Arguments
        image     : The image to draw on.
        box       : A list of 4 elements (x1, y1, x2, y2).
        color     : The color of the box.
        thickness : The thickness of the lines to draw a box with.
    """
    corners = np.array(box).astype(int)
    top_left = (corners[0], corners[1])
    bottom_right = (corners[2], corners[3])
    cv2.rectangle(image, top_left, bottom_right, color, thickness, cv2.LINE_AA)


def draw_caption(image, box, caption):
    """ Draws a caption above the box in an image.

    # Arguments
        image   : The image to draw on.
        box     : A list of 4 elements (x1, y1, x2, y2).
        caption : String containing the text to draw.
    """
    corners = np.array(box).astype(int)
    anchor = (corners[0], corners[1] - 10)
    # A thicker black pass first, then a thinner white pass on top, so the text gets a dark outline.
    for text_color, text_thickness in (((0, 0, 0), 2), ((255, 255, 255), 1)):
        cv2.putText(image, caption, anchor, cv2.FONT_HERSHEY_PLAIN, 1, text_color, text_thickness)


def draw_boxes(image, boxes, color, thickness=2):
    """ Draws boxes on an image with a given color.

    # Arguments
        image     : The image to draw on.
        boxes     : A [N, 4] matrix (x1, y1, x2, y2).
        color     : The color of the boxes.
        thickness : The thickness of the lines to draw boxes with.
    """
    for box in boxes:
        draw_box(image, box, color, thickness=thickness)


def draw_detections(image, boxes, scores, labels, color=None, label_to_name=None, score_threshold=0.5):
    """ Draws detections in an image.

    # Arguments
        image           : The image to draw on.
        boxes           : A [N, 4] matrix (x1, y1, x2, y2).
        scores          : A list of N classification scores.
        labels          : A list of N labels.
        color           : The color of the boxes. By default the color from keras_retinanet.utils.colors.label_color will be used.
        label_to_name   : (optional) Functor for mapping a label to a name.
        score_threshold : Threshold used for determining what detections to draw.
    """
    # Accept plain lists as documented: the comparison and the row indexing below need arrays.
    boxes = np.asarray(boxes)
    scores = np.asarray(scores)

    selection = np.where(scores > score_threshold)[0]

    for i in selection:
        box_color = color if color is not None else label_color(labels[i])
        draw_box(image, boxes[i, :], color=box_color)

        # draw labels; formatting (rather than '+') also works when the label is a number
        name = label_to_name(labels[i]) if label_to_name else labels[i]
        caption = '{}: {:.2f}'.format(name, scores[i])
        draw_caption(image, boxes[i, :], caption)


def draw_annotations(image, annotations, color=(0, 255, 0), label_to_name=None):
    """ Draws annotations in an image.

    # Arguments
        image         : The image to draw on.
        annotations   : A [N, 5] matrix (x1, y1, x2, y2, label) or dictionary containing bboxes (shaped [N, 4]) and labels (shaped [N]).
        color         : The color of the boxes (green by default). Pass None to use the color from keras_retinanet.utils.colors.label_color.
        label_to_name : (optional) Functor for mapping a label to a name.
    """
    if isinstance(annotations, np.ndarray):
        annotations = {'bboxes': annotations[:, :4], 'labels': annotations[:, 4]}

    assert('bboxes' in annotations)
    assert('labels' in annotations)
    assert(annotations['bboxes'].shape[0] == annotations['labels'].shape[0])

    bboxes, labels = annotations['bboxes'], annotations['labels']
    for index in range(bboxes.shape[0]):
        label = labels[index]
        box_color = color if color is not None else label_color(label)
        caption = '{}'.format(label_to_name(label) if label_to_name else label)
        draw_caption(image, bboxes[index], caption)
        draw_box(image, bboxes[index], color=box_color)
""" if isinstance(annotations, np.ndarray): annotations = {'bboxes': annotations[:, :4], 'labels': annotations[:, 4]} assert('bboxes' in annotations) assert('labels' in annotations) assert(annotations['bboxes'].shape[0] == annotations['labels'].shape[0]) for i in range(annotations['bboxes'].shape[0]): label = annotations['labels'][i] c = color if color is not None else label_color(label) caption = '{}'.format(label_to_name(label) if label_to_name else label) draw_caption(image, annotations['bboxes'][i], caption) draw_box(image, annotations['bboxes'][i], color=c) ================================================ FILE: imageai_tf_deprecated/Prediction/Custom/__init__.py ================================================ from ...Classification.Custom import ClassificationModelTrainer, CustomImageClassification class ModelTraining(ClassificationModelTrainer): """ Deprecated! Replaced with 'imageai.Classification.Custom.ClassificationModelTrainer' """ def __call__(self): None class CustomImagePrediction(CustomImageClassification): """ Deprecated! Replaced with 'imageai.Classification.Custom.CustomImageClassification' """ def __call__(self): None ================================================ FILE: imageai_tf_deprecated/Prediction/Custom/custom_utils.py ================================================ import json CLASS_INDEX = None def preprocess_input(x): """Preprocesses a tensor encoding a batch of images. # Arguments x: input Numpy tensor, 4D. data_format: data format of the image tensor. # Returns Preprocessed tensor. 
""" # 'RGB'->'BGR' x *= (1./255) return x def decode_predictions(preds, top=5, model_json=""): global CLASS_INDEX if CLASS_INDEX is None: CLASS_INDEX = json.load(open(model_json)) results = [] for pred in preds: top_indices = pred.argsort()[-top:][::-1] for i in top_indices: each_result = [] each_result.append(CLASS_INDEX[str(i)]) each_result.append(pred[i]) results.append(each_result) return results ================================================ FILE: imageai_tf_deprecated/Prediction/__init__.py ================================================ from ..Classification import ImageClassification from matplotlib.cbook import deprecated class ImagePrediction(ImageClassification): """ Deprecated! Replaced with 'imageai.Classification.ImageClassification' """ def __call__(self): None ================================================ FILE: imageai_tf_deprecated/Prediction/imagenet_utils.py ================================================ CLASS_INDEX = None def preprocess_input(x): """Preprocesses a tensor encoding a batch of images. # Arguments x: input Numpy tensor, 4D. data_format: data format of the image tensor. # Returns Preprocessed tensor. """ # 'RGB'->'BGR' x = x[..., ::-1] # Zero-center by mean pixel x[..., 0] -= 103.939 x[..., 1] -= 116.779 x[..., 2] -= 123.68 return x def decode_predictions(preds, top=5): """Decodes the prediction of an ImageNet model. # Arguments preds: Numpy tensor encoding a batch of predictions. top: integer, how many top-guesses to return. # Returns A list of lists of top class prediction tuples `(class_name, class_description, score)`. One list of tuples per sample in batch input. # Raises ValueError: in case of invalid shape of the `pred` array (must be 2D). """ global CLASS_INDEX if len(preds.shape) != 2 or preds.shape[1] != 1000: raise ValueError('`decode_predictions` expects ' 'a batch of predictions ' '(i.e. a 2D array of shape (samples, 1000)). 
' 'Found array with shape: ' + str(preds.shape)) if CLASS_INDEX is None: CLASS_INDEX = {"0": ["n01440764", "tench"], "1": ["n01443537", "goldfish"], "2": ["n01484850", "great_white_shark"], "3": ["n01491361", "tiger_shark"], "4": ["n01494475", "hammerhead"], "5": ["n01496331", "electric_ray"], "6": ["n01498041", "stingray"], "7": ["n01514668", "cock"], "8": ["n01514859", "hen"], "9": ["n01518878", "ostrich"], "10": ["n01530575", "brambling"], "11": ["n01531178", "goldfinch"], "12": ["n01532829", "house_finch"], "13": ["n01534433", "junco"], "14": ["n01537544", "indigo_bunting"], "15": ["n01558993", "robin"], "16": ["n01560419", "bulbul"], "17": ["n01580077", "jay"], "18": ["n01582220", "magpie"], "19": ["n01592084", "chickadee"], "20": ["n01601694", "water_ouzel"], "21": ["n01608432", "kite"], "22": ["n01614925", "bald_eagle"], "23": ["n01616318", "vulture"], "24": ["n01622779", "great_grey_owl"], "25": ["n01629819", "European_fire_salamander"], "26": ["n01630670", "common_newt"], "27": ["n01631663", "eft"], "28": ["n01632458", "spotted_salamander"], "29": ["n01632777", "axolotl"], "30": ["n01641577", "bullfrog"], "31": ["n01644373", "tree_frog"], "32": ["n01644900", "tailed_frog"], "33": ["n01664065", "loggerhead"], "34": ["n01665541", "leatherback_turtle"], "35": ["n01667114", "mud_turtle"], "36": ["n01667778", "terrapin"], "37": ["n01669191", "box_turtle"], "38": ["n01675722", "banded_gecko"], "39": ["n01677366", "common_iguana"], "40": ["n01682714", "American_chameleon"], "41": ["n01685808", "whiptail"], "42": ["n01687978", "agama"], "43": ["n01688243", "frilled_lizard"], "44": ["n01689811", "alligator_lizard"], "45": ["n01692333", "Gila_monster"], "46": ["n01693334", "green_lizard"], "47": ["n01694178", "African_chameleon"], "48": ["n01695060", "Komodo_dragon"], "49": ["n01697457", "African_crocodile"], "50": ["n01698640", "American_alligator"], "51": ["n01704323", "triceratops"], "52": ["n01728572", "thunder_snake"], "53": ["n01728920", "ringneck_snake"], 
"54": ["n01729322", "hognose_snake"], "55": ["n01729977", "green_snake"], "56": ["n01734418", "king_snake"], "57": ["n01735189", "garter_snake"], "58": ["n01737021", "water_snake"], "59": ["n01739381", "vine_snake"], "60": ["n01740131", "night_snake"], "61": ["n01742172", "boa_constrictor"], "62": ["n01744401", "rock_python"], "63": ["n01748264", "Indian_cobra"], "64": ["n01749939", "green_mamba"], "65": ["n01751748", "sea_snake"], "66": ["n01753488", "horned_viper"], "67": ["n01755581", "diamondback"], "68": ["n01756291", "sidewinder"], "69": ["n01768244", "trilobite"], "70": ["n01770081", "harvestman"], "71": ["n01770393", "scorpion"], "72": ["n01773157", "black_and_gold_garden_spider"], "73": ["n01773549", "barn_spider"], "74": ["n01773797", "garden_spider"], "75": ["n01774384", "black_widow"], "76": ["n01774750", "tarantula"], "77": ["n01775062", "wolf_spider"], "78": ["n01776313", "tick"], "79": ["n01784675", "centipede"], "80": ["n01795545", "black_grouse"], "81": ["n01796340", "ptarmigan"], "82": ["n01797886", "ruffed_grouse"], "83": ["n01798484", "prairie_chicken"], "84": ["n01806143", "peacock"], "85": ["n01806567", "quail"], "86": ["n01807496", "partridge"], "87": ["n01817953", "African_grey"], "88": ["n01818515", "macaw"], "89": ["n01819313", "sulphur-crested_cockatoo"], "90": ["n01820546", "lorikeet"], "91": ["n01824575", "coucal"], "92": ["n01828970", "bee_eater"], "93": ["n01829413", "hornbill"], "94": ["n01833805", "hummingbird"], "95": ["n01843065", "jacamar"], "96": ["n01843383", "toucan"], "97": ["n01847000", "drake"], "98": ["n01855032", "red-breasted_merganser"], "99": ["n01855672", "goose"], "100": ["n01860187", "black_swan"], "101": ["n01871265", "tusker"], "102": ["n01872401", "echidna"], "103": ["n01873310", "platypus"], "104": ["n01877812", "wallaby"], "105": ["n01882714", "koala"], "106": ["n01883070", "wombat"], "107": ["n01910747", "jellyfish"], "108": ["n01914609", "sea_anemone"], "109": ["n01917289", "brain_coral"], "110": 
["n01924916", "flatworm"], "111": ["n01930112", "nematode"], "112": ["n01943899", "conch"], "113": ["n01944390", "snail"], "114": ["n01945685", "slug"], "115": ["n01950731", "sea_slug"], "116": ["n01955084", "chiton"], "117": ["n01968897", "chambered_nautilus"], "118": ["n01978287", "Dungeness_crab"], "119": ["n01978455", "rock_crab"], "120": ["n01980166", "fiddler_crab"], "121": ["n01981276", "king_crab"], "122": ["n01983481", "American_lobster"], "123": ["n01984695", "spiny_lobster"], "124": ["n01985128", "crayfish"], "125": ["n01986214", "hermit_crab"], "126": ["n01990800", "isopod"], "127": ["n02002556", "white_stork"], "128": ["n02002724", "black_stork"], "129": ["n02006656", "spoonbill"], "130": ["n02007558", "flamingo"], "131": ["n02009229", "little_blue_heron"], "132": ["n02009912", "American_egret"], "133": ["n02011460", "bittern"], "134": ["n02012849", "crane"], "135": ["n02013706", "limpkin"], "136": ["n02017213", "European_gallinule"], "137": ["n02018207", "American_coot"], "138": ["n02018795", "bustard"], "139": ["n02025239", "ruddy_turnstone"], "140": ["n02027492", "red-backed_sandpiper"], "141": ["n02028035", "redshank"], "142": ["n02033041", "dowitcher"], "143": ["n02037110", "oystercatcher"], "144": ["n02051845", "pelican"], "145": ["n02056570", "king_penguin"], "146": ["n02058221", "albatross"], "147": ["n02066245", "grey_whale"], "148": ["n02071294", "killer_whale"], "149": ["n02074367", "dugong"], "150": ["n02077923", "sea_lion"], "151": ["n02085620", "Chihuahua"], "152": ["n02085782", "Japanese_spaniel"], "153": ["n02085936", "Maltese_dog"], "154": ["n02086079", "Pekinese"], "155": ["n02086240", "Shih-Tzu"], "156": ["n02086646", "Blenheim_spaniel"], "157": ["n02086910", "papillon"], "158": ["n02087046", "toy_terrier"], "159": ["n02087394", "Rhodesian_ridgeback"], "160": ["n02088094", "Afghan_hound"], "161": ["n02088238", "basset"], "162": ["n02088364", "beagle"], "163": ["n02088466", "bloodhound"], "164": ["n02088632", "bluetick"], "165": 
["n02089078", "black-and-tan_coonhound"], "166": ["n02089867", "Walker_hound"], "167": ["n02089973", "English_foxhound"], "168": ["n02090379", "redbone"], "169": ["n02090622", "borzoi"], "170": ["n02090721", "Irish_wolfhound"], "171": ["n02091032", "Italian_greyhound"], "172": ["n02091134", "whippet"], "173": ["n02091244", "Ibizan_hound"], "174": ["n02091467", "Norwegian_elkhound"], "175": ["n02091635", "otterhound"], "176": ["n02091831", "Saluki"], "177": ["n02092002", "Scottish_deerhound"], "178": ["n02092339", "Weimaraner"], "179": ["n02093256", "Staffordshire_bullterrier"], "180": ["n02093428", "American_Staffordshire_terrier"], "181": ["n02093647", "Bedlington_terrier"], "182": ["n02093754", "Border_terrier"], "183": ["n02093859", "Kerry_blue_terrier"], "184": ["n02093991", "Irish_terrier"], "185": ["n02094114", "Norfolk_terrier"], "186": ["n02094258", "Norwich_terrier"], "187": ["n02094433", "Yorkshire_terrier"], "188": ["n02095314", "wire-haired_fox_terrier"], "189": ["n02095570", "Lakeland_terrier"], "190": ["n02095889", "Sealyham_terrier"], "191": ["n02096051", "Airedale"], "192": ["n02096177", "cairn"], "193": ["n02096294", "Australian_terrier"], "194": ["n02096437", "Dandie_Dinmont"], "195": ["n02096585", "Boston_bull"], "196": ["n02097047", "miniature_schnauzer"], "197": ["n02097130", "giant_schnauzer"], "198": ["n02097209", "standard_schnauzer"], "199": ["n02097298", "Scotch_terrier"], "200": ["n02097474", "Tibetan_terrier"], "201": ["n02097658", "silky_terrier"], "202": ["n02098105", "soft-coated_wheaten_terrier"], "203": ["n02098286", "West_Highland_white_terrier"], "204": ["n02098413", "Lhasa"], "205": ["n02099267", "flat-coated_retriever"], "206": ["n02099429", "curly-coated_retriever"], "207": ["n02099601", "golden_retriever"], "208": ["n02099712", "Labrador_retriever"], "209": ["n02099849", "Chesapeake_Bay_retriever"], "210": ["n02100236", "German_short-haired_pointer"], "211": ["n02100583", "vizsla"], "212": ["n02100735", "English_setter"], 
"213": ["n02100877", "Irish_setter"], "214": ["n02101006", "Gordon_setter"], "215": ["n02101388", "Brittany_spaniel"], "216": ["n02101556", "clumber"], "217": ["n02102040", "English_springer"], "218": ["n02102177", "Welsh_springer_spaniel"], "219": ["n02102318", "cocker_spaniel"], "220": ["n02102480", "Sussex_spaniel"], "221": ["n02102973", "Irish_water_spaniel"], "222": ["n02104029", "kuvasz"], "223": ["n02104365", "schipperke"], "224": ["n02105056", "groenendael"], "225": ["n02105162", "malinois"], "226": ["n02105251", "briard"], "227": ["n02105412", "kelpie"], "228": ["n02105505", "komondor"], "229": ["n02105641", "Old_English_sheepdog"], "230": ["n02105855", "Shetland_sheepdog"], "231": ["n02106030", "collie"], "232": ["n02106166", "Border_collie"], "233": ["n02106382", "Bouvier_des_Flandres"], "234": ["n02106550", "Rottweiler"], "235": ["n02106662", "German_shepherd"], "236": ["n02107142", "Doberman"], "237": ["n02107312", "miniature_pinscher"], "238": ["n02107574", "Greater_Swiss_Mountain_dog"], "239": ["n02107683", "Bernese_mountain_dog"], "240": ["n02107908", "Appenzeller"], "241": ["n02108000", "EntleBucher"], "242": ["n02108089", "boxer"], "243": ["n02108422", "bull_mastiff"], "244": ["n02108551", "Tibetan_mastiff"], "245": ["n02108915", "French_bulldog"], "246": ["n02109047", "Great_Dane"], "247": ["n02109525", "Saint_Bernard"], "248": ["n02109961", "Eskimo_dog"], "249": ["n02110063", "malamute"], "250": ["n02110185", "Siberian_husky"], "251": ["n02110341", "dalmatian"], "252": ["n02110627", "affenpinscher"], "253": ["n02110806", "basenji"], "254": ["n02110958", "pug"], "255": ["n02111129", "Leonberg"], "256": ["n02111277", "Newfoundland"], "257": ["n02111500", "Great_Pyrenees"], "258": ["n02111889", "Samoyed"], "259": ["n02112018", "Pomeranian"], "260": ["n02112137", "chow"], "261": ["n02112350", "keeshond"], "262": ["n02112706", "Brabancon_griffon"], "263": ["n02113023", "Pembroke"], "264": ["n02113186", "Cardigan"], "265": ["n02113624", "toy_poodle"], 
"266": ["n02113712", "miniature_poodle"], "267": ["n02113799", "standard_poodle"], "268": ["n02113978", "Mexican_hairless"], "269": ["n02114367", "timber_wolf"], "270": ["n02114548", "white_wolf"], "271": ["n02114712", "red_wolf"], "272": ["n02114855", "coyote"], "273": ["n02115641", "dingo"], "274": ["n02115913", "dhole"], "275": ["n02116738", "African_hunting_dog"], "276": ["n02117135", "hyena"], "277": ["n02119022", "red_fox"], "278": ["n02119789", "kit_fox"], "279": ["n02120079", "Arctic_fox"], "280": ["n02120505", "grey_fox"], "281": ["n02123045", "tabby"], "282": ["n02123159", "tiger_cat"], "283": ["n02123394", "Persian_cat"], "284": ["n02123597", "Siamese_cat"], "285": ["n02124075", "Egyptian_cat"], "286": ["n02125311", "cougar"], "287": ["n02127052", "lynx"], "288": ["n02128385", "leopard"], "289": ["n02128757", "snow_leopard"], "290": ["n02128925", "jaguar"], "291": ["n02129165", "lion"], "292": ["n02129604", "tiger"], "293": ["n02130308", "cheetah"], "294": ["n02132136", "brown_bear"], "295": ["n02133161", "American_black_bear"], "296": ["n02134084", "ice_bear"], "297": ["n02134418", "sloth_bear"], "298": ["n02137549", "mongoose"], "299": ["n02138441", "meerkat"], "300": ["n02165105", "tiger_beetle"], "301": ["n02165456", "ladybug"], "302": ["n02167151", "ground_beetle"], "303": ["n02168699", "long-horned_beetle"], "304": ["n02169497", "leaf_beetle"], "305": ["n02172182", "dung_beetle"], "306": ["n02174001", "rhinoceros_beetle"], "307": ["n02177972", "weevil"], "308": ["n02190166", "fly"], "309": ["n02206856", "bee"], "310": ["n02219486", "ant"], "311": ["n02226429", "grasshopper"], "312": ["n02229544", "cricket"], "313": ["n02231487", "walking_stick"], "314": ["n02233338", "cockroach"], "315": ["n02236044", "mantis"], "316": ["n02256656", "cicada"], "317": ["n02259212", "leafhopper"], "318": ["n02264363", "lacewing"], "319": ["n02268443", "dragonfly"], "320": ["n02268853", "damselfly"], "321": ["n02276258", "admiral"], "322": ["n02277742", "ringlet"], 
"323": ["n02279972", "monarch"], "324": ["n02280649", "cabbage_butterfly"], "325": ["n02281406", "sulphur_butterfly"], "326": ["n02281787", "lycaenid"], "327": ["n02317335", "starfish"], "328": ["n02319095", "sea_urchin"], "329": ["n02321529", "sea_cucumber"], "330": ["n02325366", "wood_rabbit"], "331": ["n02326432", "hare"], "332": ["n02328150", "Angora"], "333": ["n02342885", "hamster"], "334": ["n02346627", "porcupine"], "335": ["n02356798", "fox_squirrel"], "336": ["n02361337", "marmot"], "337": ["n02363005", "beaver"], "338": ["n02364673", "guinea_pig"], "339": ["n02389026", "sorrel"], "340": ["n02391049", "zebra"], "341": ["n02395406", "hog"], "342": ["n02396427", "wild_boar"], "343": ["n02397096", "warthog"], "344": ["n02398521", "hippopotamus"], "345": ["n02403003", "ox"], "346": ["n02408429", "water_buffalo"], "347": ["n02410509", "bison"], "348": ["n02412080", "ram"], "349": ["n02415577", "bighorn"], "350": ["n02417914", "ibex"], "351": ["n02422106", "hartebeest"], "352": ["n02422699", "impala"], "353": ["n02423022", "gazelle"], "354": ["n02437312", "Arabian_camel"], "355": ["n02437616", "llama"], "356": ["n02441942", "weasel"], "357": ["n02442845", "mink"], "358": ["n02443114", "polecat"], "359": ["n02443484", "black-footed_ferret"], "360": ["n02444819", "otter"], "361": ["n02445715", "skunk"], "362": ["n02447366", "badger"], "363": ["n02454379", "armadillo"], "364": ["n02457408", "three-toed_sloth"], "365": ["n02480495", "orangutan"], "366": ["n02480855", "gorilla"], "367": ["n02481823", "chimpanzee"], "368": ["n02483362", "gibbon"], "369": ["n02483708", "siamang"], "370": ["n02484975", "guenon"], "371": ["n02486261", "patas"], "372": ["n02486410", "baboon"], "373": ["n02487347", "macaque"], "374": ["n02488291", "langur"], "375": ["n02488702", "colobus"], "376": ["n02489166", "proboscis_monkey"], "377": ["n02490219", "marmoset"], "378": ["n02492035", "capuchin"], "379": ["n02492660", "howler_monkey"], "380": ["n02493509", "titi"], "381": ["n02493793", 
"spider_monkey"], "382": ["n02494079", "squirrel_monkey"], "383": ["n02497673", "Madagascar_cat"], "384": ["n02500267", "indri"], "385": ["n02504013", "Indian_elephant"], "386": ["n02504458", "African_elephant"], "387": ["n02509815", "lesser_panda"], "388": ["n02510455", "giant_panda"], "389": ["n02514041", "barracouta"], "390": ["n02526121", "eel"], "391": ["n02536864", "coho"], "392": ["n02606052", "rock_beauty"], "393": ["n02607072", "anemone_fish"], "394": ["n02640242", "sturgeon"], "395": ["n02641379", "gar"], "396": ["n02643566", "lionfish"], "397": ["n02655020", "puffer"], "398": ["n02666196", "abacus"], "399": ["n02667093", "abaya"], "400": ["n02669723", "academic_gown"], "401": ["n02672831", "accordion"], "402": ["n02676566", "acoustic_guitar"], "403": ["n02687172", "aircraft_carrier"], "404": ["n02690373", "airliner"], "405": ["n02692877", "airship"], "406": ["n02699494", "altar"], "407": ["n02701002", "ambulance"], "408": ["n02704792", "amphibian"], "409": ["n02708093", "analog_clock"], "410": ["n02727426", "apiary"], "411": ["n02730930", "apron"], "412": ["n02747177", "ashcan"], "413": ["n02749479", "assault_rifle"], "414": ["n02769748", "backpack"], "415": ["n02776631", "bakery"], "416": ["n02777292", "balance_beam"], "417": ["n02782093", "balloon"], "418": ["n02783161", "ballpoint"], "419": ["n02786058", "Band_Aid"], "420": ["n02787622", "banjo"], "421": ["n02788148", "bannister"], "422": ["n02790996", "barbell"], "423": ["n02791124", "barber_chair"], "424": ["n02791270", "barbershop"], "425": ["n02793495", "barn"], "426": ["n02794156", "barometer"], "427": ["n02795169", "barrel"], "428": ["n02797295", "barrow"], "429": ["n02799071", "baseball"], "430": ["n02802426", "basketball"], "431": ["n02804414", "bassinet"], "432": ["n02804610", "bassoon"], "433": ["n02807133", "bathing_cap"], "434": ["n02808304", "bath_towel"], "435": ["n02808440", "bathtub"], "436": ["n02814533", "beach_wagon"], "437": ["n02814860", "beacon"], "438": ["n02815834", "beaker"], 
"439": ["n02817516", "bearskin"], "440": ["n02823428", "beer_bottle"], "441": ["n02823750", "beer_glass"], "442": ["n02825657", "bell_cote"], "443": ["n02834397", "bib"], "444": ["n02835271", "bicycle-built-for-two"], "445": ["n02837789", "bikini"], "446": ["n02840245", "binder"], "447": ["n02841315", "binoculars"], "448": ["n02843684", "birdhouse"], "449": ["n02859443", "boathouse"], "450": ["n02860847", "bobsled"], "451": ["n02865351", "bolo_tie"], "452": ["n02869837", "bonnet"], "453": ["n02870880", "bookcase"], "454": ["n02871525", "bookshop"], "455": ["n02877765", "bottlecap"], "456": ["n02879718", "bow"], "457": ["n02883205", "bow_tie"], "458": ["n02892201", "brass"], "459": ["n02892767", "brassiere"], "460": ["n02894605", "breakwater"], "461": ["n02895154", "breastplate"], "462": ["n02906734", "broom"], "463": ["n02909870", "bucket"], "464": ["n02910353", "buckle"], "465": ["n02916936", "bulletproof_vest"], "466": ["n02917067", "bullet_train"], "467": ["n02927161", "butcher_shop"], "468": ["n02930766", "cab"], "469": ["n02939185", "caldron"], "470": ["n02948072", "candle"], "471": ["n02950826", "cannon"], "472": ["n02951358", "canoe"], "473": ["n02951585", "can_opener"], "474": ["n02963159", "cardigan"], "475": ["n02965783", "car_mirror"], "476": ["n02966193", "carousel"], "477": ["n02966687", "carpenter's_kit"], "478": ["n02971356", "carton"], "479": ["n02974003", "car_wheel"], "480": ["n02977058", "cash_machine"], "481": ["n02978881", "cassette"], "482": ["n02979186", "cassette_player"], "483": ["n02980441", "castle"], "484": ["n02981792", "catamaran"], "485": ["n02988304", "CD_player"], "486": ["n02992211", "cello"], "487": ["n02992529", "cellular_telephone"], "488": ["n02999410", "chain"], "489": ["n03000134", "chainlink_fence"], "490": ["n03000247", "chain_mail"], "491": ["n03000684", "chain_saw"], "492": ["n03014705", "chest"], "493": ["n03016953", "chiffonier"], "494": ["n03017168", "chime"], "495": ["n03018349", "china_cabinet"], "496": ["n03026506", 
"Christmas_stocking"], "497": ["n03028079", "church"], "498": ["n03032252", "cinema"], "499": ["n03041632", "cleaver"], "500": ["n03042490", "cliff_dwelling"], "501": ["n03045698", "cloak"], "502": ["n03047690", "clog"], "503": ["n03062245", "cocktail_shaker"], "504": ["n03063599", "coffee_mug"], "505": ["n03063689", "coffeepot"], "506": ["n03065424", "coil"], "507": ["n03075370", "combination_lock"], "508": ["n03085013", "computer_keyboard"], "509": ["n03089624", "confectionery"], "510": ["n03095699", "container_ship"], "511": ["n03100240", "convertible"], "512": ["n03109150", "corkscrew"], "513": ["n03110669", "cornet"], "514": ["n03124043", "cowboy_boot"], "515": ["n03124170", "cowboy_hat"], "516": ["n03125729", "cradle"], "517": ["n03126707", "crane"], "518": ["n03127747", "crash_helmet"], "519": ["n03127925", "crate"], "520": ["n03131574", "crib"], "521": ["n03133878", "Crock_Pot"], "522": ["n03134739", "croquet_ball"], "523": ["n03141823", "crutch"], "524": ["n03146219", "cuirass"], "525": ["n03160309", "dam"], "526": ["n03179701", "desk"], "527": ["n03180011", "desktop_computer"], "528": ["n03187595", "dial_telephone"], "529": ["n03188531", "diaper"], "530": ["n03196217", "digital_clock"], "531": ["n03197337", "digital_watch"], "532": ["n03201208", "dining_table"], "533": ["n03207743", "dishrag"], "534": ["n03207941", "dishwasher"], "535": ["n03208938", "disk_brake"], "536": ["n03216828", "dock"], "537": ["n03218198", "dogsled"], "538": ["n03220513", "dome"], "539": ["n03223299", "doormat"], "540": ["n03240683", "drilling_platform"], "541": ["n03249569", "drum"], "542": ["n03250847", "drumstick"], "543": ["n03255030", "dumbbell"], "544": ["n03259280", "Dutch_oven"], "545": ["n03271574", "electric_fan"], "546": ["n03272010", "electric_guitar"], "547": ["n03272562", "electric_locomotive"], "548": ["n03290653", "entertainment_center"], "549": ["n03291819", "envelope"], "550": ["n03297495", "espresso_maker"], "551": ["n03314780", "face_powder"], "552": 
["n03325584", "feather_boa"], "553": ["n03337140", "file"], "554": ["n03344393", "fireboat"], "555": ["n03345487", "fire_engine"], "556": ["n03347037", "fire_screen"], "557": ["n03355925", "flagpole"], "558": ["n03372029", "flute"], "559": ["n03376595", "folding_chair"], "560": ["n03379051", "football_helmet"], "561": ["n03384352", "forklift"], "562": ["n03388043", "fountain"], "563": ["n03388183", "fountain_pen"], "564": ["n03388549", "four-poster"], "565": ["n03393912", "freight_car"], "566": ["n03394916", "French_horn"], "567": ["n03400231", "frying_pan"], "568": ["n03404251", "fur_coat"], "569": ["n03417042", "garbage_truck"], "570": ["n03424325", "gasmask"], "571": ["n03425413", "gas_pump"], "572": ["n03443371", "goblet"], "573": ["n03444034", "go-kart"], "574": ["n03445777", "golf_ball"], "575": ["n03445924", "golfcart"], "576": ["n03447447", "gondola"], "577": ["n03447721", "gong"], "578": ["n03450230", "gown"], "579": ["n03452741", "grand_piano"], "580": ["n03457902", "greenhouse"], "581": ["n03459775", "grille"], "582": ["n03461385", "grocery_store"], "583": ["n03467068", "guillotine"], "584": ["n03476684", "hair_slide"], "585": ["n03476991", "hair_spray"], "586": ["n03478589", "half_track"], "587": ["n03481172", "hammer"], "588": ["n03482405", "hamper"], "589": ["n03483316", "hand_blower"], "590": ["n03485407", "hand-held_computer"], "591": ["n03485794", "handkerchief"], "592": ["n03492542", "hard_disc"], "593": ["n03494278", "harmonica"], "594": ["n03495258", "harp"], "595": ["n03496892", "harvester"], "596": ["n03498962", "hatchet"], "597": ["n03527444", "holster"], "598": ["n03529860", "home_theater"], "599": ["n03530642", "honeycomb"], "600": ["n03532672", "hook"], "601": ["n03534580", "hoopskirt"], "602": ["n03535780", "horizontal_bar"], "603": ["n03538406", "horse_cart"], "604": ["n03544143", "hourglass"], "605": ["n03584254", "iPod"], "606": ["n03584829", "iron"], "607": ["n03590841", "jack-o'-lantern"], "608": ["n03594734", "jean"], "609": 
["n03594945", "jeep"], "610": ["n03595614", "jersey"], "611": ["n03598930", "jigsaw_puzzle"], "612": ["n03599486", "jinrikisha"], "613": ["n03602883", "joystick"], "614": ["n03617480", "kimono"], "615": ["n03623198", "knee_pad"], "616": ["n03627232", "knot"], "617": ["n03630383", "lab_coat"], "618": ["n03633091", "ladle"], "619": ["n03637318", "lampshade"], "620": ["n03642806", "laptop"], "621": ["n03649909", "lawn_mower"], "622": ["n03657121", "lens_cap"], "623": ["n03658185", "letter_opener"], "624": ["n03661043", "library"], "625": ["n03662601", "lifeboat"], "626": ["n03666591", "lighter"], "627": ["n03670208", "limousine"], "628": ["n03673027", "liner"], "629": ["n03676483", "lipstick"], "630": ["n03680355", "Loafer"], "631": ["n03690938", "lotion"], "632": ["n03691459", "loudspeaker"], "633": ["n03692522", "loupe"], "634": ["n03697007", "lumbermill"], "635": ["n03706229", "magnetic_compass"], "636": ["n03709823", "mailbag"], "637": ["n03710193", "mailbox"], "638": ["n03710637", "maillot"], "639": ["n03710721", "maillot"], "640": ["n03717622", "manhole_cover"], "641": ["n03720891", "maraca"], "642": ["n03721384", "marimba"], "643": ["n03724870", "mask"], "644": ["n03729826", "matchstick"], "645": ["n03733131", "maypole"], "646": ["n03733281", "maze"], "647": ["n03733805", "measuring_cup"], "648": ["n03742115", "medicine_chest"], "649": ["n03743016", "megalith"], "650": ["n03759954", "microphone"], "651": ["n03761084", "microwave"], "652": ["n03763968", "military_uniform"], "653": ["n03764736", "milk_can"], "654": ["n03769881", "minibus"], "655": ["n03770439", "miniskirt"], "656": ["n03770679", "minivan"], "657": ["n03773504", "missile"], "658": ["n03775071", "mitten"], "659": ["n03775546", "mixing_bowl"], "660": ["n03776460", "mobile_home"], "661": ["n03777568", "Model_T"], "662": ["n03777754", "modem"], "663": ["n03781244", "monastery"], "664": ["n03782006", "monitor"], "665": ["n03785016", "moped"], "666": ["n03786901", "mortar"], "667": ["n03787032", 
"mortarboard"], "668": ["n03788195", "mosque"], "669": ["n03788365", "mosquito_net"], "670": ["n03791053", "motor_scooter"], "671": ["n03792782", "mountain_bike"], "672": ["n03792972", "mountain_tent"], "673": ["n03793489", "mouse"], "674": ["n03794056", "mousetrap"], "675": ["n03796401", "moving_van"], "676": ["n03803284", "muzzle"], "677": ["n03804744", "nail"], "678": ["n03814639", "neck_brace"], "679": ["n03814906", "necklace"], "680": ["n03825788", "nipple"], "681": ["n03832673", "notebook"], "682": ["n03837869", "obelisk"], "683": ["n03838899", "oboe"], "684": ["n03840681", "ocarina"], "685": ["n03841143", "odometer"], "686": ["n03843555", "oil_filter"], "687": ["n03854065", "organ"], "688": ["n03857828", "oscilloscope"], "689": ["n03866082", "overskirt"], "690": ["n03868242", "oxcart"], "691": ["n03868863", "oxygen_mask"], "692": ["n03871628", "packet"], "693": ["n03873416", "paddle"], "694": ["n03874293", "paddlewheel"], "695": ["n03874599", "padlock"], "696": ["n03876231", "paintbrush"], "697": ["n03877472", "pajama"], "698": ["n03877845", "palace"], "699": ["n03884397", "panpipe"], "700": ["n03887697", "paper_towel"], "701": ["n03888257", "parachute"], "702": ["n03888605", "parallel_bars"], "703": ["n03891251", "park_bench"], "704": ["n03891332", "parking_meter"], "705": ["n03895866", "passenger_car"], "706": ["n03899768", "patio"], "707": ["n03902125", "pay-phone"], "708": ["n03903868", "pedestal"], "709": ["n03908618", "pencil_box"], "710": ["n03908714", "pencil_sharpener"], "711": ["n03916031", "perfume"], "712": ["n03920288", "Petri_dish"], "713": ["n03924679", "photocopier"], "714": ["n03929660", "pick"], "715": ["n03929855", "pickelhaube"], "716": ["n03930313", "picket_fence"], "717": ["n03930630", "pickup"], "718": ["n03933933", "pier"], "719": ["n03935335", "piggy_bank"], "720": ["n03937543", "pill_bottle"], "721": ["n03938244", "pillow"], "722": ["n03942813", "ping-pong_ball"], "723": ["n03944341", "pinwheel"], "724": ["n03947888", "pirate"], 
"725": ["n03950228", "pitcher"], "726": ["n03954731", "plane"], "727": ["n03956157", "planetarium"], "728": ["n03958227", "plastic_bag"], "729": ["n03961711", "plate_rack"], "730": ["n03967562", "plow"], "731": ["n03970156", "plunger"], "732": ["n03976467", "Polaroid_camera"], "733": ["n03976657", "pole"], "734": ["n03977966", "police_van"], "735": ["n03980874", "poncho"], "736": ["n03982430", "pool_table"], "737": ["n03983396", "pop_bottle"], "738": ["n03991062", "pot"], "739": ["n03992509", "potter's_wheel"], "740": ["n03995372", "power_drill"], "741": ["n03998194", "prayer_rug"], "742": ["n04004767", "printer"], "743": ["n04005630", "prison"], "744": ["n04008634", "projectile"], "745": ["n04009552", "projector"], "746": ["n04019541", "puck"], "747": ["n04023962", "punching_bag"], "748": ["n04026417", "purse"], "749": ["n04033901", "quill"], "750": ["n04033995", "quilt"], "751": ["n04037443", "racer"], "752": ["n04039381", "racket"], "753": ["n04040759", "radiator"], "754": ["n04041544", "radio"], "755": ["n04044716", "radio_telescope"], "756": ["n04049303", "rain_barrel"], "757": ["n04065272", "recreational_vehicle"], "758": ["n04067472", "reel"], "759": ["n04069434", "reflex_camera"], "760": ["n04070727", "refrigerator"], "761": ["n04074963", "remote_control"], "762": ["n04081281", "restaurant"], "763": ["n04086273", "revolver"], "764": ["n04090263", "rifle"], "765": ["n04099969", "rocking_chair"], "766": ["n04111531", "rotisserie"], "767": ["n04116512", "rubber_eraser"], "768": ["n04118538", "rugby_ball"], "769": ["n04118776", "rule"], "770": ["n04120489", "running_shoe"], "771": ["n04125021", "safe"], "772": ["n04127249", "safety_pin"], "773": ["n04131690", "saltshaker"], "774": ["n04133789", "sandal"], "775": ["n04136333", "sarong"], "776": ["n04141076", "sax"], "777": ["n04141327", "scabbard"], "778": ["n04141975", "scale"], "779": ["n04146614", "school_bus"], "780": ["n04147183", "schooner"], "781": ["n04149813", "scoreboard"], "782": ["n04152593", 
"screen"], "783": ["n04153751", "screw"], "784": ["n04154565", "screwdriver"], "785": ["n04162706", "seat_belt"], "786": ["n04179913", "sewing_machine"], "787": ["n04192698", "shield"], "788": ["n04200800", "shoe_shop"], "789": ["n04201297", "shoji"], "790": ["n04204238", "shopping_basket"], "791": ["n04204347", "shopping_cart"], "792": ["n04208210", "shovel"], "793": ["n04209133", "shower_cap"], "794": ["n04209239", "shower_curtain"], "795": ["n04228054", "ski"], "796": ["n04229816", "ski_mask"], "797": ["n04235860", "sleeping_bag"], "798": ["n04238763", "slide_rule"], "799": ["n04239074", "sliding_door"], "800": ["n04243546", "slot"], "801": ["n04251144", "snorkel"], "802": ["n04252077", "snowmobile"], "803": ["n04252225", "snowplow"], "804": ["n04254120", "soap_dispenser"], "805": ["n04254680", "soccer_ball"], "806": ["n04254777", "sock"], "807": ["n04258138", "solar_dish"], "808": ["n04259630", "sombrero"], "809": ["n04263257", "soup_bowl"], "810": ["n04264628", "space_bar"], "811": ["n04265275", "space_heater"], "812": ["n04266014", "space_shuttle"], "813": ["n04270147", "spatula"], "814": ["n04273569", "speedboat"], "815": ["n04275548", "spider_web"], "816": ["n04277352", "spindle"], "817": ["n04285008", "sports_car"], "818": ["n04286575", "spotlight"], "819": ["n04296562", "stage"], "820": ["n04310018", "steam_locomotive"], "821": ["n04311004", "steel_arch_bridge"], "822": ["n04311174", "steel_drum"], "823": ["n04317175", "stethoscope"], "824": ["n04325704", "stole"], "825": ["n04326547", "stone_wall"], "826": ["n04328186", "stopwatch"], "827": ["n04330267", "stove"], "828": ["n04332243", "strainer"], "829": ["n04335435", "streetcar"], "830": ["n04336792", "stretcher"], "831": ["n04344873", "studio_couch"], "832": ["n04346328", "stupa"], "833": ["n04347754", "submarine"], "834": ["n04350905", "suit"], "835": ["n04355338", "sundial"], "836": ["n04355933", "sunglass"], "837": ["n04356056", "sunglasses"], "838": ["n04357314", "sunscreen"], "839": ["n04366367", 
"suspension_bridge"], "840": ["n04367480", "swab"], "841": ["n04370456", "sweatshirt"], "842": ["n04371430", "swimming_trunks"], "843": ["n04371774", "swing"], "844": ["n04372370", "switch"], "845": ["n04376876", "syringe"], "846": ["n04380533", "table_lamp"], "847": ["n04389033", "tank"], "848": ["n04392985", "tape_player"], "849": ["n04398044", "teapot"], "850": ["n04399382", "teddy"], "851": ["n04404412", "television"], "852": ["n04409515", "tennis_ball"], "853": ["n04417672", "thatch"], "854": ["n04418357", "theater_curtain"], "855": ["n04423845", "thimble"], "856": ["n04428191", "thresher"], "857": ["n04429376", "throne"], "858": ["n04435653", "tile_roof"], "859": ["n04442312", "toaster"], "860": ["n04443257", "tobacco_shop"], "861": ["n04447861", "toilet_seat"], "862": ["n04456115", "torch"], "863": ["n04458633", "totem_pole"], "864": ["n04461696", "tow_truck"], "865": ["n04462240", "toyshop"], "866": ["n04465501", "tractor"], "867": ["n04467665", "trailer_truck"], "868": ["n04476259", "tray"], "869": ["n04479046", "trench_coat"], "870": ["n04482393", "tricycle"], "871": ["n04483307", "trimaran"], "872": ["n04485082", "tripod"], "873": ["n04486054", "triumphal_arch"], "874": ["n04487081", "trolleybus"], "875": ["n04487394", "trombone"], "876": ["n04493381", "tub"], "877": ["n04501370", "turnstile"], "878": ["n04505470", "typewriter_keyboard"], "879": ["n04507155", "umbrella"], "880": ["n04509417", "unicycle"], "881": ["n04515003", "upright"], "882": ["n04517823", "vacuum"], "883": ["n04522168", "vase"], "884": ["n04523525", "vault"], "885": ["n04525038", "velvet"], "886": ["n04525305", "vending_machine"], "887": ["n04532106", "vestment"], "888": ["n04532670", "viaduct"], "889": ["n04536866", "violin"], "890": ["n04540053", "volleyball"], "891": ["n04542943", "waffle_iron"], "892": ["n04548280", "wall_clock"], "893": ["n04548362", "wallet"], "894": ["n04550184", "wardrobe"], "895": ["n04552348", "warplane"], "896": ["n04553703", "washbasin"], "897": 
["n04554684", "washer"], "898": ["n04557648", "water_bottle"], "899": ["n04560804", "water_jug"], "900": ["n04562935", "water_tower"], "901": ["n04579145", "whiskey_jug"], "902": ["n04579432", "whistle"], "903": ["n04584207", "wig"], "904": ["n04589890", "window_screen"], "905": ["n04590129", "window_shade"], "906": ["n04591157", "Windsor_tie"], "907": ["n04591713", "wine_bottle"], "908": ["n04592741", "wing"], "909": ["n04596742", "wok"], "910": ["n04597913", "wooden_spoon"], "911": ["n04599235", "wool"], "912": ["n04604644", "worm_fence"], "913": ["n04606251", "wreck"], "914": ["n04612504", "yawl"], "915": ["n04613696", "yurt"], "916": ["n06359193", "web_site"], "917": ["n06596364", "comic_book"], "918": ["n06785654", "crossword_puzzle"], "919": ["n06794110", "street_sign"], "920": ["n06874185", "traffic_light"], "921": ["n07248320", "book_jacket"], "922": ["n07565083", "menu"], "923": ["n07579787", "plate"], "924": ["n07583066", "guacamole"], "925": ["n07584110", "consomme"], "926": ["n07590611", "hot_pot"], "927": ["n07613480", "trifle"], "928": ["n07614500", "ice_cream"], "929": ["n07615774", "ice_lolly"], "930": ["n07684084", "French_loaf"], "931": ["n07693725", "bagel"], "932": ["n07695742", "pretzel"], "933": ["n07697313", "cheeseburger"], "934": ["n07697537", "hotdog"], "935": ["n07711569", "mashed_potato"], "936": ["n07714571", "head_cabbage"], "937": ["n07714990", "broccoli"], "938": ["n07715103", "cauliflower"], "939": ["n07716358", "zucchini"], "940": ["n07716906", "spaghetti_squash"], "941": ["n07717410", "acorn_squash"], "942": ["n07717556", "butternut_squash"], "943": ["n07718472", "cucumber"], "944": ["n07718747", "artichoke"], "945": ["n07720875", "bell_pepper"], "946": ["n07730033", "cardoon"], "947": ["n07734744", "mushroom"], "948": ["n07742313", "Granny_Smith"], "949": ["n07745940", "strawberry"], "950": ["n07747607", "orange"], "951": ["n07749582", "lemon"], "952": ["n07753113", "fig"], "953": ["n07753275", "pineapple"], "954": ["n07753592", 
"banana"], "955": ["n07754684", "jackfruit"], "956": ["n07760859", "custard_apple"], "957": ["n07768694", "pomegranate"], "958": ["n07802026", "hay"], "959": ["n07831146", "carbonara"], "960": ["n07836838", "chocolate_sauce"], "961": ["n07860988", "dough"], "962": ["n07871810", "meat_loaf"], "963": ["n07873807", "pizza"], "964": ["n07875152", "potpie"], "965": ["n07880968", "burrito"], "966": ["n07892512", "red_wine"], "967": ["n07920052", "espresso"], "968": ["n07930864", "cup"], "969": ["n07932039", "eggnog"], "970": ["n09193705", "alp"], "971": ["n09229709", "bubble"], "972": ["n09246464", "cliff"], "973": ["n09256479", "coral_reef"], "974": ["n09288635", "geyser"], "975": ["n09332890", "lakeside"], "976": ["n09399592", "promontory"], "977": ["n09421951", "sandbar"], "978": ["n09428293", "seashore"], "979": ["n09468604", "valley"], "980": ["n09472597", "volcano"], "981": ["n09835506", "ballplayer"], "982": ["n10148035", "groom"], "983": ["n10565667", "scuba_diver"], "984": ["n11879895", "rapeseed"], "985": ["n11939491", "daisy"], "986": ["n12057211", "yellow_lady's_slipper"], "987": ["n12144580", "corn"], "988": ["n12267677", "acorn"], "989": ["n12620546", "hip"], "990": ["n12768682", "buckeye"], "991": ["n12985857", "coral_fungus"], "992": ["n12998815", "agaric"], "993": ["n13037406", "gyromitra"], "994": ["n13040303", "stinkhorn"], "995": ["n13044778", "earthstar"], "996": ["n13052670", "hen-of-the-woods"], "997": ["n13054560", "bolete"], "998": ["n13133613", "ear"], "999": ["n15075141", "toilet_tissue"]} results = [] for pred in preds: top_indices = pred.argsort()[-top:][::-1] result = [tuple(CLASS_INDEX[str(i)]) + (pred[i],) for i in top_indices] result.sort(key=lambda x: x[2], reverse=True) results.append(result) return results ================================================ FILE: imageai_tf_deprecated/__init__.py ================================================ ================================================ FILE: requirements.txt 
================================================ cython pillow>=7.0.0 numpy>=1.18.1 opencv-python>=4.1.2 torch>=1.9.0 --extra-index-url https://download.pytorch.org/whl/cpu torchvision>=0.10.0 --extra-index-url https://download.pytorch.org/whl/cpu pytest==7.1.3 tqdm==4.64.1 scipy>=1.7.3 matplotlib>=3.4.3 mock==4.0.3 ================================================ FILE: requirements_extra.txt ================================================ pycocotools@git+https://github.com/gautamchitnis/cocoapi.git@cocodataset-master#subdirectory=PythonAPI ================================================ FILE: requirements_gpu.txt ================================================ cython pillow>=7.0.0 numpy>=1.18.1 opencv-python>=4.1.2 torch>=1.9.0 --extra-index-url https://download.pytorch.org/whl/cu102 torchvision>=0.10.0 --extra-index-url https://download.pytorch.org/whl/cu102 pytest==7.1.3 tqdm==4.64.1 scipy>=1.7.3 matplotlib>=3.4.3 mock==4.0.3 ================================================ FILE: scripts/pascal_voc_to_yolo.py ================================================ import glob import os import argparse import pickle import xml.etree.ElementTree as ET from os import listdir, getcwd from os.path import join import shutil dirs = ['train', 'validation'] sub_dirs = ["images", "annotations"] classes = [] def convert(size, box): dw = 1./(size[0]) dh = 1./(size[1]) x = (box[0] + box[1])/2.0 - 1 y = (box[2] + box[3])/2.0 - 1 w = box[1] - box[0] h = box[3] - box[2] x = x*dw w = w*dw y = y*dh h = h*dh return (x,y,w,h) def convert_annotation(input_ann_path): tree = ET.parse(input_ann_path) root = tree.getroot() size = root.find('size') w = int(size.find('width').text) h = int(size.find('height').text) ann_list = [] for obj in root.iter('object'): obj_class = obj.find('name').text if obj_class not in classes: classes.append(obj_class) xmlbox = obj.find('bndbox') b = (float(xmlbox.find('xmin').text), float(xmlbox.find('xmax').text), float(xmlbox.find('ymin').text), 
def main(dataset_dir: str):
    """Convert a Pascal VOC style dataset into a sibling ``<dataset_dir>-yolo`` dataset.

    For every split listed in ``dirs`` the images are copied unchanged and each
    XML annotation is rewritten as a text file holding one
    ``<class_idx> <x> <y> <w> <h>`` line per object. A ``classes.txt`` with the
    sorted class names is written once into each output ``annotations`` folder.

    Images and annotations are processed independently of each other. The
    earlier implementation paired the two directory listings with ``zip``,
    which silently dropped the surplus files whenever the two folders held a
    different number of entries.

    Args:
        dataset_dir: Root of the source dataset. It must contain
            ``<split>/images`` and ``<split>/annotations`` for every split.
    """
    image_extensions = (".png", ".jpg", ".jpeg")
    yolo_dataset = os.path.join(
        os.path.dirname(dataset_dir),
        os.path.basename(f"{dataset_dir}-yolo")
    )

    # Create the output folder skeleton up front.
    for split in dirs:
        split_out = os.path.join(yolo_dataset, split)
        os.makedirs(split_out, exist_ok=True)
        for sub_dir in sub_dirs:
            os.makedirs(os.path.join(split_out, sub_dir), exist_ok=True)

    anns_by_split = {"train": {}, "validation": {}}

    for split in dirs:
        images_dir = os.path.join(dataset_dir, split, "images")
        annotations_dir = os.path.join(dataset_dir, split, "annotations")

        # Sorted listings keep the run deterministic across file systems.
        images = sorted(
            name for name in os.listdir(images_dir)
            if name.endswith(image_extensions)
        )
        annotations = sorted(
            name for name in os.listdir(annotations_dir)
            if name.endswith(".xml")
        )

        for image in images:
            shutil.copy(
                os.path.join(images_dir, image),
                os.path.join(yolo_dataset, split, "images", image)
            )

        for annotation in annotations:
            # Parsing also registers newly seen class names in `classes`,
            # so it must run for every annotation file.
            ann_list = convert_annotation(
                os.path.join(annotations_dir, annotation)
            )
            if split in anns_by_split:
                anns_by_split[split][annotation] = ann_list

    all_classes = sorted(classes)
    class_to_idx = {name: idx for idx, name in enumerate(all_classes)}

    for split, split_anns in anns_by_split.items():
        out_ann_dir = os.path.join(yolo_dataset, split, "annotations")

        for ann_file, ann_list in split_anns.items():
            lines = []
            for ann in ann_list:
                bbox = " ".join(str(value) for value in ann["bbox"])
                lines.append(f"{class_to_idx[ann['class']]} {bbox}\n")

            # Only the trailing extension is swapped; a ".xml" elsewhere in
            # the file name is left alone.
            output_ann_path = os.path.join(
                out_ann_dir, os.path.splitext(ann_file)[0] + ".txt"
            )
            with open(output_ann_path, "w") as ann_writer:
                ann_writer.write("".join(lines))

        with open(os.path.join(out_ann_dir, "classes.txt"), "w") as classes_writer:
            classes_writer.write("\n".join(all_classes))
@pytest.mark.parametrize(
    "image_input",
    [
        os.path.join(test_folder, "data-images", "1.jpg"),
        cv2.imread(os.path.join(test_folder, "data-images", "1.jpg")),
        Image.open(os.path.join(test_folder, "data-images", "1.jpg")),
    ]
)
def test_recognition_model_resnet(image_input):
    """Classify one image with the custom ResNet50 model and check the output types.

    The image is supplied as a file path, an OpenCV array and a PIL image.
    """
    model_file = os.path.join(test_folder, "data-models", "resnet50-idenprof-test_acc_0.78200_epoch-91.pt")
    classes_file = os.path.join(test_folder, "data-json", "idenprof_model_classes.json")

    classifier = CustomImageClassification()
    classifier.setModelTypeAsResNet50()
    classifier.setModelPath(model_file)
    classifier.setJsonPath(classes_file)
    classifier.loadModel()

    predictions, probabilities = classifier.classifyImage(image_input=image_input, result_count=5)

    # Both results are lists; the first entries are a label and a score.
    for output in (predictions, probabilities):
        assert isinstance(output, list)
    assert isinstance(predictions[0], str)
    assert isinstance(probabilities[0], float)
@pytest.mark.parametrize(
    "transfer_learning",
    [
        os.path.join(pretrained_models_folder, "densenet121-a639ec97.pth"),
        None,
    ]
)
def test_densenet121_training(transfer_learning):
    """Train DenseNet121 for one experiment and check the saved artefacts.

    Runs once with pretrained weights to transfer from and once without.
    """
    models_dir = os.path.join(classification_dataset, "models")

    # Start from a clean slate so leftovers of an earlier run cannot satisfy
    # the assertions below.
    if os.path.isdir(models_dir):
        shutil.rmtree(models_dir)

    trainer = ClassificationModelTrainer()
    trainer.setModelTypeAsDenseNet121()
    trainer.setDataDirectory(data_directory=classification_dataset)
    trainer.trainModel(
        num_experiments=1,
        batch_size=2,
        transfer_from_model=transfer_learning,
    )

    assert os.path.isdir(models_dir)
    assert os.path.isfile(os.path.join(models_dir, "idenprof_model_classes.json"))
    assert any(name.endswith(".pt") for name in os.listdir(models_dir))
def delete_cache(dirs: list):
    """Recursively remove every entry of ``dirs`` that is an existing directory.

    Entries that do not exist, or that are not directories, are left alone.
    """
    # `filter` is lazy, so each path is checked right before it is removed.
    for folder in filter(os.path.isdir, dirs):
        shutil.rmtree(folder)
@pytest.mark.parametrize(
    "input_image, output_type, extract_objects",
    [
        (os.path.join(test_folder, test_folder, "data-images", "15.jpg"), "file", False),
        (os.path.join(test_folder, test_folder, "data-images", "15.jpg"), "file", True),
        (cv2.imread(os.path.join(test_folder, test_folder, "data-images", "15.jpg")), "array", False),
        (cv2.imread(os.path.join(test_folder, test_folder, "data-images", "15.jpg")), "array", True),
        (Image.open(os.path.join(test_folder, test_folder, "data-images", "15.jpg")), "array", True),
    ]
)
def test_object_detection_yolov3(input_image, output_type, extract_objects):
    """Run the custom YOLOv3 detector and validate its results.

    Covers file and array output, each with and without object extraction.
    """
    detector = CustomObjectDetection()
    detector.setModelTypeAsYOLOv3()
    detector.setModelPath(os.path.join(test_folder, "data-models", "yolov3_number-plate-dataset-imageai_mAP-0.57145_epoch-11.pt"))
    detector.setJsonPath(os.path.join(test_folder, "data-json", "number-plate-dataset-imageai_yolov3_detection_config.json"))
    detector.loadModel()

    # A unique name keeps parametrised runs from clobbering each other.
    output_img_path = os.path.join(test_folder, "data-images", str(uuid.uuid4()) + ".jpg")

    if output_type == "array" and extract_objects:
        output_image_array, detections, extracted_objects = detector.detectObjectsFromImage(
            input_image=input_image,
            output_type=output_type,
            extract_detected_objects=extract_objects,
        )
        assert len(detections) > 0
        assert len(extracted_objects) > 0
        for crop in extracted_objects:
            assert type(crop) is np.ndarray
    elif output_type == "array":
        output_image_array, detections = detector.detectObjectsFromImage(
            input_image=input_image,
            output_type=output_type,
        )
        assert type(output_image_array) is np.ndarray
        assert len(detections) > 0
    elif extract_objects:
        detections, extracted_object_paths = detector.detectObjectsFromImage(
            input_image=input_image,
            output_image_path=output_img_path,
            extract_detected_objects=True,
        )
        assert len(detections) > 0
        assert os.path.isfile(output_img_path)
        assert len(extracted_object_paths) > 0
        delete_cache(extracted_object_paths)
        delete_cache([extracted_object_paths[0], output_img_path])
    else:
        detections = detector.detectObjectsFromImage(
            input_image=input_image,
            output_image_path=output_img_path,
        )
        assert len(detections) > 0
        delete_cache([output_img_path])

    assert type(detections) is list

    expected_fields = (
        ("name", str),
        ("percentage_probability", float),
        ("box_points", list),
    )
    for detection in detections:
        assert type(detection) is dict
        for field, field_type in expected_fields:
            assert field in detection
            assert type(detection[field]) is field_type

        corners = detection["box_points"]
        for corner in corners:
            assert type(corner) is int
        # Entries 0/1 must lie strictly before entries 2/3.
        assert corners[0] < corners[2]
        assert corners[1] < corners[3]
extract_objects", [ (os.path.join(test_folder, test_folder, "data-images", "15.jpg"), "file", False), (os.path.join(test_folder, test_folder, "data-images", "15.jpg"), "file", True), (cv2.imread(os.path.join(test_folder, test_folder, "data-images", "15.jpg")), "array", False), (cv2.imread(os.path.join(test_folder, test_folder, "data-images", "15.jpg")), "array", True), (Image.open(os.path.join(test_folder, test_folder, "data-images", "15.jpg")), "array", True), ] ) def test_object_detection_tiny_yolov3(input_image, output_type, extract_objects): detector = CustomObjectDetection() detector.setModelTypeAsTinyYOLOv3() detector.setModelPath(os.path.join(test_folder, "data-models", "tiny_yolov3_number-plate-dataset-imageai_mAP-0.22595_epoch-20.pt")) detector.setJsonPath(os.path.join(test_folder, "data-json", "number-plate-dataset-imageai_tiny_yolov3_detection_config.json")) detector.loadModel() output_img_path = os.path.join(test_folder, "data-images", str(uuid.uuid4()) + ".jpg") if output_type == "array": if extract_objects: output_image_array, detections, extracted_objects = detector.detectObjectsFromImage(input_image=input_image, output_type=output_type, extract_detected_objects=extract_objects) assert len(detections) > 0 assert len(extracted_objects) == len(detections) for extracted_obj in extracted_objects: assert type(extracted_obj) == np.ndarray else: output_image_array, detections = detector.detectObjectsFromImage(input_image=input_image, output_type=output_type) assert type(output_image_array) == np.ndarray assert len(detections) > 0 else: if extract_objects: detections, extracted_object_paths = detector.detectObjectsFromImage(input_image=input_image, output_image_path=output_img_path, extract_detected_objects=True) assert len(detections) > 0 assert os.path.isfile(output_img_path) assert len(extracted_object_paths) == len(detections) delete_cache( extracted_object_paths ) delete_cache( [extracted_object_paths[0], output_img_path] ) else: detections = 
class CallbackFunctions:
    """Reference per-frame / per-second callbacks that type-check their arguments.

    Both callbacks are declared as static methods: they take no ``self``, so
    without ``@staticmethod`` calling them through an instance would pass the
    instance as the first positional argument and fail with ``TypeError``.
    Access through the class itself is unchanged.
    """

    @staticmethod
    def forFrame(frame_number, output_array, output_count, detected_frame):
        """Validate the arguments of a per-frame callback.

        Args:
            frame_number: Must be an ``int``.
            output_array: Non-empty list whose first item is a detection dict
                with ``name``, ``percentage_probability`` and ``box_points``.
            output_count: Dict mapping ``str`` keys to ``int`` values.
            detected_frame: Must be a numpy ``ndarray``.

        Raises:
            AssertionError: If any argument has an unexpected type.
        """
        assert isinstance(detected_frame, ndarray)
        assert isinstance(frame_number, int)

        assert isinstance(output_array, list)
        first_detection = output_array[0]
        assert isinstance(first_detection, dict)
        assert isinstance(first_detection["name"], str)
        assert isinstance(first_detection["percentage_probability"], float)
        assert isinstance(first_detection["box_points"], list)

        assert isinstance(output_count, dict)
        for key, count in output_count.items():
            assert isinstance(key, str)
            assert isinstance(count, int)

    @staticmethod
    def forSecond(second_number, output_arrays, count_arrays, average_output_count, detected_frame):
        """Validate the arguments of a per-second callback.

        Args:
            second_number: Must be an ``int``.
            output_arrays: Non-empty list of lists; the first inner list must
                start with a detection dict.
            count_arrays: Non-empty list whose first item maps ``str`` to ``int``.
            average_output_count: Dict mapping ``str`` keys to ``int`` values.
            detected_frame: Must be a numpy ``ndarray``.

        Raises:
            AssertionError: If any argument has an unexpected type.
        """
        assert isinstance(detected_frame, ndarray)
        assert isinstance(second_number, int)

        assert isinstance(output_arrays, list)
        assert isinstance(output_arrays[0], list)
        first_detection = output_arrays[0][0]
        assert isinstance(first_detection, dict)
        assert isinstance(first_detection["name"], str)
        assert isinstance(first_detection["percentage_probability"], float)
        assert isinstance(first_detection["box_points"], list)

        assert isinstance(count_arrays, list)
        first_counts = count_arrays[0]
        assert isinstance(first_counts, dict)
        for key, count in first_counts.items():
            assert isinstance(key, str)
            assert isinstance(count, int)

        assert isinstance(average_output_count, dict)
        for key, average in average_output_count.items():
            assert isinstance(key, str)
            assert isinstance(average, int)
def test_video_detection_yolo_analysis():
    """Detect objects in the sample video and check both callbacks are invoked.

    The callbacks handed to the detector are mocks that replace
    ``CallbackFunctions.forFrame`` and ``CallbackFunctions.forSecond``.
    """
    expected_video = video_file_output + ".mp4"
    delete_cache([expected_video])

    detector = CustomVideoObjectDetection()
    detector.setModelTypeAsYOLOv3()
    detector.setModelPath(model_path=os.path.join(test_folder, "data-models", "yolov3_number-plate-dataset-imageai_mAP-0.57145_epoch-11.pt"))
    detector.setJsonPath(os.path.join(test_folder, "data-json", "number-plate-dataset-imageai_yolov3_detection_config.json"))
    detector.loadModel()

    with patch.object(CallbackFunctions, 'forFrame') as frameFunc, \
            patch.object(CallbackFunctions, 'forSecond') as secondFunc:
        video_path = detector.detectObjectsFromVideo(
            input_file_path=video_file,
            output_file_path=video_file_output,
            save_detected_video=True,
            frames_per_second=30,
            log_progress=True,
            per_frame_function=frameFunc,
            per_second_function=secondFunc,
            return_detected_frame=True,
        )

        assert os.path.exists(expected_video)
        assert isinstance(video_path, str)
        frameFunc.assert_called()
        secondFunc.assert_called()

        delete_cache([expected_video])
@pytest.mark.parametrize(
    "image_input",
    [
        os.path.join(test_folder, "data-images", "1.jpg"),
        cv2.imread(os.path.join(test_folder, "data-images", "1.jpg")),
        Image.open(os.path.join(test_folder, "data-images", "1.jpg")),
    ]
)
def test_recognition_model_resnet(image_input):
    """Classify one image with the ResNet50 weights and check the output types.

    The image is supplied as a file path, an OpenCV array and a PIL image.
    """
    weights = os.path.join(test_folder, "data-models", "resnet50-19c8e357.pth")

    classifier = ImageClassification()
    classifier.setModelTypeAsResNet50()
    classifier.setModelPath(weights)
    classifier.loadModel()

    predictions, probabilities = classifier.classifyImage(image_input=image_input)

    # Both results are lists; the first entries are a label and a score.
    for output in (predictions, probabilities):
        assert isinstance(output, list)
    assert isinstance(predictions[0], str)
    assert isinstance(probabilities[0], float)
def delete_cache(paths: List[str]):
    """Remove each existing file or directory named in ``paths``.

    Files are unlinked, directories are removed recursively, and anything
    else (for instance a path that does not exist) is skipped.
    """
    for target in paths:
        if os.path.isfile(target):
            remover = os.remove
        elif os.path.isdir(target):
            remover = shutil.rmtree
        else:
            continue
        remover(target)
@pytest.mark.parametrize(
    "input_image, output_type, extract_objects",
    [
        (os.path.join(test_folder, test_folder, "data-images", "1.jpg"), "file", False),
        (os.path.join(test_folder, test_folder, "data-images", "4.jpg"), "file", False),
        (os.path.join(test_folder, test_folder, "data-images", "1.jpg"), "file", True),
        (cv2.imread(os.path.join(test_folder, test_folder, "data-images", "1.jpg")), "array", False),
        (cv2.imread(os.path.join(test_folder, test_folder, "data-images", "1.jpg")), "array", True),
        (Image.open(os.path.join(test_folder, test_folder, "data-images", "1.jpg")), "array", True),
    ]
)
def test_object_detection_yolov3(input_image, output_type, extract_objects):
    """Run the YOLOv3 detector and validate its results.

    Covers file and array output, each with and without object extraction.
    """
    detector = ObjectDetection()
    detector.setModelTypeAsYOLOv3()
    detector.setModelPath(os.path.join(test_folder, "data-models", "yolov3.pt"))
    detector.loadModel()

    # A unique name keeps parametrised runs from clobbering each other.
    output_img_path = os.path.join(test_folder, "data-images", str(uuid.uuid4()) + ".jpg")

    if output_type == "array" and extract_objects:
        output_image_array, detections, extracted_objects = detector.detectObjectsFromImage(
            input_image=input_image,
            output_type=output_type,
            extract_detected_objects=extract_objects,
        )
        assert len(extracted_objects) > 1
        assert type(detections) is list
        for crop in extracted_objects:
            assert type(crop) is np.ndarray
    elif output_type == "array":
        output_image_array, detections = detector.detectObjectsFromImage(
            input_image=input_image,
            output_type=output_type,
        )
        assert type(output_image_array) is np.ndarray
        assert type(detections) is list
    elif extract_objects:
        detections, extracted_object_paths = detector.detectObjectsFromImage(
            input_image=input_image,
            output_image_path=output_img_path,
            extract_detected_objects=True,
        )
        assert os.path.isfile(output_img_path)
        assert len(extracted_object_paths) > 3
        assert type(detections) is list
        delete_cache(extracted_object_paths)
        delete_cache([extracted_object_paths[0], output_img_path])
    else:
        detections = detector.detectObjectsFromImage(
            input_image=input_image,
            output_image_path=output_img_path,
        )
        assert type(detections) is list
        delete_cache([output_img_path])

    expected_fields = (
        ("name", str),
        ("percentage_probability", float),
        ("box_points", list),
    )
    for detection in detections:
        assert type(detection) is dict
        for field, field_type in expected_fields:
            assert field in detection
            assert type(detection[field]) is field_type

        corners = detection["box_points"]
        for corner in corners:
            assert type(corner) is int
        # Entries 0/1 must lie strictly before entries 2/3.
        assert corners[0] < corners[2]
        assert corners[1] < corners[3]
detector.detectObjectsFromImage(input_image=input_image, output_image_path=output_img_path, extract_detected_objects=True) assert os.path.isfile(output_img_path) assert len(extracted_object_paths) > 1 assert type(detections) == list delete_cache( extracted_object_paths ) delete_cache( [extracted_object_paths[0], output_img_path] ) else: detections = detector.detectObjectsFromImage(input_image=input_image, output_image_path=output_img_path) assert type(detections) == list delete_cache( [output_img_path] ) for eachObject in detections: assert type(eachObject) == dict assert "name" in eachObject.keys() assert type(eachObject["name"]) == str assert "percentage_probability" in eachObject.keys() assert type(eachObject["percentage_probability"]) == float assert "box_points" in eachObject.keys() assert type(eachObject["box_points"]) == list box_points = eachObject["box_points"] for point in box_points: assert type(point) == int assert box_points[0] < box_points[2] assert box_points[1] < box_points[3] @pytest.mark.parametrize( "input_image", [ (os.path.join(test_folder, test_folder, "data-images", "11.jpg")), (cv2.imread(os.path.join(test_folder, test_folder, "data-images", "11.jpg"))), (Image.open(os.path.join(test_folder, test_folder, "data-images", "11.jpg"))), ] ) def test_object_detection_retinanet_custom_objects(input_image): detector = ObjectDetection() detector.setModelTypeAsRetinaNet() detector.setModelPath(os.path.join(test_folder, "data-models", "retinanet_resnet50_fpn_coco-eeacb38b.pth")) detector.loadModel() custom = detector.CustomObjects(person=True, cell_phone=True) custom_detections = detector.detectObjectsFromImage(input_image=input_image, custom_objects=custom) for custom_detection in custom_detections: assert custom_detection["name"] in ["person", "cell phone"] detections = detector.detectObjectsFromImage(input_image=input_image) assert len(detections) > len(custom_detections) @pytest.mark.parametrize( "input_image", [ (os.path.join(test_folder, 
test_folder, "data-images", "11.jpg")), (cv2.imread(os.path.join(test_folder, test_folder, "data-images", "11.jpg"))), (Image.open(os.path.join(test_folder, test_folder, "data-images", "11.jpg"))), ] ) def test_object_detection_yolov3_custom_objects(input_image): detector = ObjectDetection() detector.setModelTypeAsYOLOv3() detector.setModelPath(os.path.join(test_folder, "data-models", "yolov3.pt")) detector.loadModel() custom = detector.CustomObjects(person=True, cell_phone=True) custom_detections = detector.detectObjectsFromImage(input_image=input_image, custom_objects=custom) for custom_detection in custom_detections: assert custom_detection["name"] in ["person", "cell phone"] detections = detector.detectObjectsFromImage(input_image=input_image) assert len(detections) > len(custom_detections) @pytest.mark.parametrize( "input_image", [ (os.path.join(test_folder, test_folder, "data-images", "11.jpg")), (cv2.imread(os.path.join(test_folder, test_folder, "data-images", "11.jpg"))), (Image.open(os.path.join(test_folder, test_folder, "data-images", "11.jpg"))), ] ) def test_object_detection_tiny_yolov3_custom_objects(input_image): detector = ObjectDetection() detector.setModelTypeAsTinyYOLOv3() detector.setModelPath(os.path.join(test_folder, "data-models", "tiny-yolov3.pt")) detector.loadModel() custom = detector.CustomObjects(person=True, cell_phone=True) custom_detections = detector.detectObjectsFromImage(input_image=input_image, custom_objects=custom) for custom_detection in custom_detections: assert custom_detection["name"] in ["person", "cell phone"] detections = detector.detectObjectsFromImage(input_image=input_image) assert len(detections) > len(custom_detections) ================================================ FILE: test/test_video_object_detection.py ================================================ import os, sys from typing import List from numpy import ndarray from os.path import dirname from mock import patch sys.path.insert(1, 
os.path.join(dirname(dirname(os.path.abspath(__file__))))) from imageai.Detection import VideoObjectDetection test_folder = dirname(os.path.abspath(__file__)) video_file = os.path.join(test_folder, "data-videos", "traffic-micro.mp4") video_file_output = os.path.join(test_folder, "data-videos", "traffic-micro-detected") class CallbackFunctions: def forFrame(frame_number, output_array, output_count, detected_frame): assert isinstance(detected_frame, ndarray) assert isinstance(frame_number, int) assert isinstance(output_array, list) assert isinstance(output_array[0], dict) assert isinstance(output_array[0]["name"], str) assert isinstance(output_array[0]["percentage_probability"], float) assert isinstance(output_array[0]["box_points"], list) assert isinstance(output_count, dict) for a_key in dict(output_count).keys(): assert isinstance(a_key, str) assert isinstance(output_count[a_key], int) def forSecond(second_number, output_arrays, count_arrays, average_output_count, detected_frame): assert isinstance(detected_frame, ndarray) assert isinstance(second_number, int) assert isinstance(output_arrays, list) assert isinstance(output_arrays[0], list) assert isinstance(output_arrays[0][0], dict) assert isinstance(output_arrays[0][0]["name"], str) assert isinstance(output_arrays[0][0]["percentage_probability"], float) assert isinstance(output_arrays[0][0]["box_points"], list) assert isinstance(count_arrays, list) assert isinstance(count_arrays[0], dict) for a_key in dict(count_arrays[0]).keys(): assert isinstance(a_key, str) assert isinstance(count_arrays[0][a_key], int) assert isinstance(average_output_count, dict) for a_key2 in dict(average_output_count).keys(): assert isinstance(a_key2, str) assert isinstance(average_output_count[a_key2], int) def delete_cache(files: List[str]): for file in files: if os.path.isfile(file): os.remove(file) def test_video_detection_retinanet(): delete_cache([video_file_output + ".mp4"]) detector = VideoObjectDetection() 
detector.setModelTypeAsRetinaNet() detector.setModelPath(model_path=os.path.join(test_folder, "data-models", "retinanet_resnet50_fpn_coco-eeacb38b.pth")) detector.loadModel() video_path = detector.detectObjectsFromVideo(input_file_path=video_file, output_file_path=video_file_output, save_detected_video=True, frames_per_second=30, log_progress=True) assert os.path.exists(video_file_output + ".mp4") assert isinstance(video_path, str) delete_cache([video_file_output + ".mp4"]) def test_video_detection_retinanet_custom_objects(): delete_cache([video_file_output + ".mp4"]) detector = VideoObjectDetection() detector.setModelTypeAsRetinaNet() detector.setModelPath(model_path=os.path.join(test_folder, "data-models", "retinanet_resnet50_fpn_coco-eeacb38b.pth")) detector.loadModel() custom_objects = detector.CustomObjects( person=True, bus=True ) video_path = detector.detectObjectsFromVideo(input_file_path=video_file, output_file_path=video_file_output, save_detected_video=True, frames_per_second=30, log_progress=True, custom_objects=custom_objects) assert os.path.exists(video_file_output + ".mp4") assert isinstance(video_path, str) delete_cache([video_file_output + ".mp4"]) def test_video_detection_yolov3(): delete_cache([video_file_output + ".mp4"]) detector = VideoObjectDetection() detector.setModelTypeAsYOLOv3() detector.setModelPath(model_path=os.path.join(test_folder, "data-models", "yolov3.pt")) detector.loadModel() video_path = detector.detectObjectsFromVideo(input_file_path=video_file, output_file_path=video_file_output, save_detected_video=True, frames_per_second=30, log_progress=True) assert os.path.exists(video_file_output + ".mp4") assert isinstance(video_path, str) delete_cache([video_file_output + ".mp4"]) def test_video_detection_tiny_yolov3(): delete_cache([video_file_output + ".mp4"]) detector = VideoObjectDetection() detector.setModelTypeAsTinyYOLOv3() detector.setModelPath(model_path=os.path.join(test_folder, "data-models", "tiny-yolov3.pt")) 
detector.loadModel() video_path = detector.detectObjectsFromVideo(input_file_path=video_file, output_file_path=video_file_output, save_detected_video=True, frames_per_second=30, log_progress=True) assert os.path.exists(video_file_output + ".mp4") assert isinstance(video_path, str) delete_cache([video_file_output + ".mp4"]) def test_video_detection_retinanet_analysis(): delete_cache([video_file_output + ".mp4"]) detector = VideoObjectDetection() detector.setModelTypeAsRetinaNet() detector.setModelPath(model_path=os.path.join(test_folder, "data-models", "retinanet_resnet50_fpn_coco-eeacb38b.pth")) detector.loadModel() with patch.object(CallbackFunctions, 'forFrame') as frameFunc: with patch.object(CallbackFunctions, 'forSecond') as secondFunc: video_path = detector.detectObjectsFromVideo(input_file_path=video_file, output_file_path=video_file_output, save_detected_video=True, frames_per_second=30, log_progress=True, per_frame_function=frameFunc, per_second_function=secondFunc, return_detected_frame=True) assert os.path.exists(video_file_output + ".mp4") assert isinstance(video_path, str) frameFunc.assert_called() secondFunc.assert_called() delete_cache([video_file_output + ".mp4"])