Repository: AIWintermuteAI/aXeleRate Branch: master Commit: 0012d683e1cb Files: 135 Total size: 572.1 KB Directory structure: gitextract_o2hqtp1u/ ├── .github/ │ ├── FUNDING.yml │ ├── ISSUE_TEMPLATE/ │ │ ├── bug_report.yml │ │ ├── config.yml │ │ └── feature_request.yml │ └── workflows/ │ └── python-publish.yml ├── .gitignore ├── LICENSE ├── README.md ├── axelerate/ │ ├── __init__.py │ ├── evaluate.py │ ├── infer.py │ ├── networks/ │ │ ├── __init__.py │ │ ├── classifier/ │ │ │ ├── __init__.py │ │ │ ├── batch_gen.py │ │ │ ├── directory_iterator.py │ │ │ ├── frontend_classifier.py │ │ │ ├── iterator.py │ │ │ └── utils.py │ │ ├── common_utils/ │ │ │ ├── __init__.py │ │ │ ├── augment.py │ │ │ ├── callbacks.py │ │ │ ├── convert.py │ │ │ ├── feature.py │ │ │ ├── fit.py │ │ │ ├── install_edge_tpu_compiler.sh │ │ │ ├── install_openvino.sh │ │ │ └── mobilenet_sipeed/ │ │ │ ├── __init__.py │ │ │ ├── imagenet_utils.py │ │ │ └── mobilenet.py │ │ ├── segnet/ │ │ │ ├── __init__.py │ │ │ ├── data_utils/ │ │ │ │ ├── __init__.py │ │ │ │ └── data_loader.py │ │ │ ├── frontend_segnet.py │ │ │ ├── metrics.py │ │ │ ├── models/ │ │ │ │ ├── __init__.py │ │ │ │ ├── _pspnet_2.py │ │ │ │ ├── all_models.py │ │ │ │ ├── basic_models.py │ │ │ │ ├── config.py │ │ │ │ ├── fcn.py │ │ │ │ ├── model.py │ │ │ │ ├── model_utils.py │ │ │ │ ├── pspnet.py │ │ │ │ ├── segnet.py │ │ │ │ └── unet.py │ │ │ ├── predict.py │ │ │ └── train.py │ │ └── yolo/ │ │ ├── __init__.py │ │ ├── backend/ │ │ │ ├── __init__.py │ │ │ ├── batch_gen.py │ │ │ ├── decoder.py │ │ │ ├── loss.py │ │ │ ├── network.py │ │ │ └── utils/ │ │ │ ├── __init__.py │ │ │ ├── annotation.py │ │ │ ├── box.py │ │ │ ├── custom.py │ │ │ └── eval/ │ │ │ ├── __init__.py │ │ │ ├── _box_match.py │ │ │ └── fscore.py │ │ └── frontend.py │ └── train.py ├── configs/ │ ├── classifier.json │ ├── detector.json │ ├── dogs_classifier.json │ ├── face_detector.json │ ├── kangaroo_detector.json │ ├── lego_detector.json │ ├── pascal_20_detector.json │ ├── pascal_20_detector_2.json │ ├── pascal_20_segnet.json │ ├── person_detector.json │ ├── raccoon_detector.json │ ├── santa_uno.json │ └── segmentation.json ├── example_scripts/ │ ├── arm_nn/ │ │ ├── README.md │ │ ├── box.py │ │ ├── cv_utils.py │ │ ├── network_executor.py │ │ ├── run_video_file.py │ │ ├── run_video_stream.py │ │ └── yolov2.py │ ├── edge_tpu/ │ │ └── detector/ │ │ ├── box.py │ │ └── detector_video.py │ ├── k210/ │ │ ├── classifier/ │ │ │ └── santa_uno.py │ │ ├── detector/ │ │ │ ├── yolov2/ │ │ │ │ ├── person_detector_v4.py │ │ │ │ ├── raccoon_detector.py │ │ │ │ └── raccoon_detector_uart.py │ │ │ └── yolov3/ │ │ │ └── raccoon_detector.py │ │ └── segnet/ │ │ └── segnet-support-is-WIP-contributions-welcome │ ├── oak/ │ │ └── yolov2/ │ │ ├── YOLO_best_mAP.json │ │ ├── box.py │ │ ├── yolo.py │ │ └── yolo_alt.py │ └── tensorflow_lite/ │ ├── classifier/ │ │ ├── base_camera.py │ │ ├── camera_opencv.py │ │ ├── camera_pi.py │ │ ├── classifier_file.py │ │ ├── classifier_stream.py │ │ ├── cv_utils.py │ │ └── templates/ │ │ └── index.html │ ├── detector/ │ │ ├── base_camera.py │ │ ├── camera_opencv.py │ │ ├── camera_pi.py │ │ ├── cv_utils.py │ │ ├── detector_file.py │ │ ├── detector_stream.py │ │ └── templates/ │ │ └── index.html │ └── segnet/ │ ├── base_camera.py │ ├── camera_opencv.py │ ├── camera_pi.py │ ├── cv_utils.py │ ├── segnet_file.py │ ├── segnet_stream.py │ └── templates/ │ └── index.html ├── resources/ │ ├── aXeleRate_face_detector.ipynb │ ├── aXeleRate_human_segmentation.ipynb │ ├── aXeleRate_mark_detector.ipynb │ ├── 
aXeleRate_pascal20_detector.ipynb │ ├── aXeleRate_person_detector.ipynb │ └── aXeleRate_standford_dog_classifier.ipynb ├── sample_datasets/ │ └── detector/ │ ├── anns/ │ │ ├── 2007_000032.xml │ │ └── 2007_000033.xml │ └── anns_validation/ │ ├── 2007_000243.xml │ ├── 2007_000250.xml │ ├── 2007_000645.xml │ ├── 2007_001595.xml │ ├── 2007_001834.xml │ ├── 2007_003131.xml │ ├── 2007_003201.xml │ ├── 2007_003593.xml │ ├── 2007_004627.xml │ └── 2007_005803.xml ├── setup.py └── tests_training_and_inference.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/FUNDING.yml ================================================ # These are supported funding model platforms github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] patreon: # Replace with a single Patreon username open_collective: # Replace with a single Open Collective username ko_fi: # Replace with a single Ko-fi username tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry liberapay: # Replace with a single Liberapay username issuehunt: # Replace with a single IssueHunt username otechie: # Replace with a single Otechie username custom: ['https://www.buymeacoffee.com/hardwareai'] ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.yml ================================================ name: Bug Report description: File a bug report title: "[Bug]: " labels: [bug, triage] assignees: - AIWintermuteAI body: - type: markdown attributes: value: | Thanks for taking the time to fill out this bug report! Before you do, however, make sure you have done the following. - type: checkboxes id: googled attributes: label: Check if applicable options: - label: I used Google/Bing/other search engines to thoroughly research my question and DID NOT find any suitable answers required: true - label: Additionally I went through the issues in this repository/MaixPy/Tensorflow repositories and DID NOT find any suitable answers required: true - type: textarea id: what-happened attributes: label: Describe the bug description: A clear and concise description of what the bug is, with screenshots/models/videos if necessary. value: | **To Reproduce** Steps to reproduce the behavior: 1. Go to '...' 2. Click on '....' 3. Scroll down to '....' 4. See error validations: required: true - type: textarea id: what-expected attributes: label: Expected behavior description: A clear and concise description of what you expected to happen. validations: required: true - type: textarea id: platform attributes: label: Platform description: What platform are you running the code on. value: | - Device: [e.g. Raspberry Pi 4 or M5 StickV] - OS/firmware: [e.g. Raspbian OS 32bit kernel version ...] - Version/commit number of aXeleRate: [e.g. d1816f5] validations: required: true - type: textarea id: logs attributes: label: Relevant log output description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. 
render: shell ================================================ FILE: .github/ISSUE_TEMPLATE/config.yml ================================================ blank_issues_enabled: false contact_links: - name: Google url: https://google.com/ about: Please find answers to general questions, e.g. "what are anchors", "how is mAP calculated", "my cat coughing up fur can you help please" HERE. ================================================ FILE: .github/ISSUE_TEMPLATE/feature_request.yml ================================================ name: Feature request description: Suggest an idea for this project title: "[Feature request]: " labels: [enhancement, help wanted] body: - type: markdown attributes: value: | Thanks for your interest in improving aXeleRate! It is a personal project of mine, which I continually develop with the help of other volunteers. - type: checkboxes id: boxes attributes: label: Choose an option options: - label: I'd like to contribute to development by making a PR. - label: Alternatively I could consider a small beer donation to the developer as a token of my appreciation. - type: textarea id: feature attributes: label: Describe the desired feature description: A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]. Add screenshots/models/videos if necessary. validations: required: true - type: textarea id: what-expected attributes: label: Describe the solution you'd like description: A clear and concise description of what you want to happen. validations: required: true - type: textarea id: logs attributes: label: Relevant log output description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. render: shell ================================================ FILE: .github/workflows/python-publish.yml ================================================ # This workflow will upload a Python Package using Twine when a release is created # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries name: Upload Python Package on: release: types: [created] jobs: deploy: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - name: Set up Python uses: actions/setup-python@v2 with: python-version: '3.x' - name: Install dependencies run: | python -m pip install --upgrade pip pip install setuptools wheel twine - name: Build and publish env: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: | python setup.py sdist bdist_wheel twine upload dist/* ================================================ FILE: .gitignore ================================================ __pycache__/ axelerate/networks/common_utils/ncc axelerate/networks/common_utils/ncc_linux_x86_64.tar.xz axelerate.egg-info/ build/ dist/ _configs/ projects/ logs/ *.tflite *.h5 *.kmodel *.txt *.pyc .vscode/ ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2020 Dmitry Maslov Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above
copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================

aXeleRate

Keras-based framework for AI on the Edge


aXeleRate streamlines training computer vision models and converting them to run on various platforms with hardware acceleration. It is optimized both for the workflow on a local machine (Ubuntu 18.04/20.04; other Linux distributions might work, but are untested; macOS/Windows are not supported) and for Google Colab. It currently supports converting trained models to the .kmodel (K210), .tflite (full-integer and dynamic-range quantization available) and .onnx formats. Experimental support: Google Edge TPU.
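For a sense of the workflow, here is a minimal sketch of the Colab-style dictionary config. Only keys that appear in this code excerpt are shown; the dataset paths and the MobileNet1_0 choice are placeholders, the full schema lives in `configs/*.json`, and passing the dict straight to `setup_training` is an assumption based on how `axelerate/__init__.py` exports the entry points:

```python
from axelerate import setup_training

# Hypothetical minimal Classifier config: only keys visible in this
# excerpt (see evaluate.py / infer.py below) are included; real configs
# in configs/*.json carry additional training parameters.
config = {
    "model": {
        "type": "Classifier",
        "architecture": "MobileNet1_0",
        "input_size": [224, 224],
        "fully-connected": [100, 50],
        "dropout": 0.5,
        "labels": [],   # empty -> labels are inferred from folder names
    },
    "train": {
        "train_image_folder": "path/to/train",   # hypothetical path
        "valid_image_folder": "path/to/valid",   # hypothetical path
    },
}

setup_training(config)   # assumed signature; train.py is not shown in this excerpt
```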

Example projects (Colab notebooks):

- Stanford Dog Breed Classification Dataset: NASNetMobile backend + Classifier (Open In Colab)
- PASCAL-VOC 2012 Object Detection Dataset: MobileNet1_0 backend + YOLOv3 (Open In Colab)
- Human parsing Semantic Segmentation: MobileNet5_0 backend + Segnet-Basic (Open In Colab)
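Once a model is trained (for instance with one of the notebooks above), the same entry points can be driven from Python. `setup_inference` and `setup_evaluation` are defined with exactly these signatures in `axelerate/infer.py` and `axelerate/evaluate.py` below; the weights path here is hypothetical:

```python
import json
from axelerate import setup_inference, setup_evaluation

# Reuse the JSON config the model was trained with.
with open("configs/raccoon_detector.json") as f:
    config = json.load(f)

# Hypothetical path: aXeleRate saves Keras models in the project folder.
weights = "projects/raccoon/model_best.h5"

# Results (annotated images, report.txt) are written to a folder
# next to the weights file; see infer.py and evaluate.py below.
setup_inference(config, weights, threshold=0.5)
setup_evaluation(config, weights)
```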
### aXeleRate TL;DR

aXeleRate is meant for people who need to run computer vision applications (image classification, object detection, semantic segmentation) on edge devices with hardware acceleration. It has an easy configuration process through a config file or a config dictionary (for Google Colab) and automatically converts the best model of a training session into the required file format. You put properly formatted data in, start the training script and (hopefully) come back to see a converted model that is ready for deployment on your device!

### :wrench: Key Features

- Supports multiple computer vision models: object detection (YOLOv3), image classification, semantic segmentation (SegNet-Basic).
- Different feature extractors to be used with the above network types: Full Yolo, Tiny Yolo, MobileNet, SqueezeNet, NASNetMobile, ResNet50, and DenseNet121.
- Automatic conversion of the best model of the training session; aXeleRate will download the suitable converter automatically.
- Currently supports trained model conversion to: .kmodel (K210), .tflite (full-integer and dynamic-range quantization available), .tflite (Edge TPU), .onnx (for later on-device optimization with TensorRT).
- Model version control made easier: Keras model files and converted models are saved in the project folder, grouped by training date. Training history is saved as a .png graph in the model folder.
- Two modes of operation: locally, with the train.py script and a .json config file, or remotely, tailored for Google Colab, with module import and a dictionary config.

### 💾 Install

Stable version: ```pip install axelerate```

Daily development version: ```pip install git+https://github.com/AIWintermuteAI/aXeleRate```

If installing in an Anaconda environment, make sure the necessary CUDA/CUDNN version is installed in that environment, so the GPU can be used for training.

### :question: F.A.Q.

Q: I trained a YOLO model, but it doesn't run on K210 with MaixPy firmware.

A: While there can be a lot of reasons for that (memory constraints being one of them), the master branch of aXeleRate trains a YOLOv3 model, which shows better convergence, especially for datasets with smaller objects and non-square image sizes. There is a [PR for adding YOLOv3 support](https://github.com/sipeed/MaixPy/pull/451) to MaixPy (where you can also see my comparisons of the two), but it is not merged at the moment. There are two options for training a model that can run on K210 MaixPy:

- switch to the legacy branch of aXeleRate with ```git switch legacy-yolov2``` (if you are running the training locally, you will also need to re-install aXeleRate after that with ```pip install -e .```). The trained model should be compatible with current MaixPy.
- use [this pre-compiled firmware](https://drive.google.com/file/d/1q1BcWA8GiTQ_3Q9vYkSysRvGD62K2zh4/view?usp=sharing) with experimental support for YOLOv3 (examples included) or compile your own from [this PR's branch](https://github.com/sipeed/MaixPy/pull/451).

### :computer: Project Story

aXeleRate started as a personal project of mine for training YOLOv2-based object detection networks and exporting them to the .kmodel format to be run on the K210 chip. I also needed to train image classification networks, and sometimes I needed to run inference with Tensorflow Lite on a Raspberry Pi. As a result I had a whole bunch of disconnected scripts, each with somewhat overlapping functionality. So I decided to fix that and share the results with other people who might have similar workflows. aXeleRate is still a work-in-progress project.
I will be making some changes from time to time, and if you find it useful and can contribute, PRs are very much welcome!

:ballot_box_with_check: TODO list: the TODO list is moving to GitHub Projects!

### Acknowledgements

- YOLOv2 Keras code: jeongjoonsup and Ngoc Anh Huynh https://github.com/experiencor/keras-yolo2 https://github.com/penny4860/Yolo-digit-detector
- SegNet Keras code: Divam Gupta https://github.com/divamgupta/image-segmentation-keras
- Big Thank You to the creators/maintainers of Keras/Tensorflow

### Donation

Recently a few people wanted to make a small donation to aXeleRate because it helped them with their work. I was caught off guard by the question about donations :) I didn't have anything set up, so I quickly created a page for them to be able to send money. If aXeleRate was useful in your work, you can donate a pizza or a beer to the project here: https://www.buymeacoffee.com/hardwareai . But times are tough now (and always), so if you don't have much to spare, don't feel guilty! aXeleRate is totally open source and free to use.

================================================ FILE: axelerate/__init__.py ================================================ from .train import setup_training from .infer import setup_inference from .evaluate import setup_evaluation ================================================ FILE: axelerate/evaluate.py ================================================ import os import argparse import json import cv2 import numpy as np import matplotlib import matplotlib.pyplot as plt import matplotlib.image as mpimg from tensorflow.keras import backend as K from axelerate.networks.yolo.frontend import create_yolo from axelerate.networks.yolo.backend.utils.box import draw_boxes from axelerate.networks.yolo.backend.utils.annotation import parse_annotation from axelerate.networks.yolo.backend.utils.eval.fscore import count_true_positives, calc_score from axelerate.networks.segnet.frontend_segnet import create_segnet from axelerate.networks.classifier.frontend_classifier import get_labels, create_classifier K.clear_session() DEFAULT_THRESHOLD = 0.3 def save_report(config, report, report_file): with open(report_file, 'w') as outfile: outfile.write("REPORT\n") outfile.write(str(report)) outfile.write("\nCONFIG\n") outfile.write(json.dumps(config, indent=4, sort_keys=False)) def show_image(filename): image = mpimg.imread(filename) plt.figure() plt.imshow(image) plt.show(block=False) plt.pause(1) plt.close() print(filename) def prepare_image(img_path, network): orig_image = cv2.imread(img_path) input_image = cv2.cvtColor(orig_image, cv2.COLOR_BGR2RGB) input_image = cv2.resize(input_image, (network.input_size[1], network.input_size[0])) input_image = network.norm(input_image) input_image = np.expand_dims(input_image, 0) return orig_image, input_image def setup_evaluation(config, weights, threshold = None): try: matplotlib.use('TkAgg') except: pass #added for compatibility with < 0.5.7 versions try: input_size = config['model']['input_size'][:] except: input_size = [config['model']['input_size'],config['model']['input_size']] """make directory to save inference results """ dirname = os.path.dirname(weights) if config['model']['type']=='Classifier': print('Classifier') if config['model']['labels']: labels = config['model']['labels'] else: labels = get_labels(config['train']['train_image_folder']) # 1. Construct the model classifier = create_classifier(config['model']['architecture'], labels, input_size, config['model']['fully-connected'],
config['model']['dropout']) # 2. Load the pretrained weights classifier.load_weights(weights) report, cm = classifier.evaluate(config['train']['valid_image_folder'], 16) save_report(config, report, os.path.join(dirname, 'report.txt')) if config['model']['type']=='SegNet': print('Segmentation') # 1. Construct the model segnet = create_segnet(config['model']['architecture'], input_size, config['model']['n_classes']) # 2. Load the pretrained weights (if any) segnet.load_weights(weights) report = segnet.evaluate(config['train']['valid_image_folder'], config['train']['valid_annot_folder'], 2) save_report(config, report, os.path.join(dirname, 'report.txt')) print(report) if config['model']['type']=='Detector': # 2. create yolo instance & predict yolo = create_yolo(config['model']['architecture'], config['model']['labels'], input_size, config['model']['anchors'], config['model']['obj_thresh'], config['model']['iou_thresh'], config['model']['coord_scale'], config['model']['object_scale'], config['model']['no_object_scale'], config['weights']['backend']) yolo.load_weights(weights) # 3. read image annotations = parse_annotation(config['train']['valid_annot_folder'], config['train']['valid_image_folder'], config['model']['labels'], is_only_detect=config['train']['is_only_detect']) threshold = threshold if threshold else config['model']['obj_thresh'] dirname = os.path.join(os.path.dirname(weights), 'Inference_results') #temporary if os.path.isdir(dirname): print("Folder {} already exists. Image files in the directory might be overwritten".format(dirname)) else: print("Folder {} is created.".format(dirname)) os.makedirs(dirname) n_true_positives = 0 n_truth = 0 n_pred = 0 inference_time = [] for i in range(len(annotations)): img_path = annotations.fname(i) img_fname = os.path.basename(img_path) true_boxes = annotations.boxes(i) true_labels = annotations.code_labels(i) orig_image, input_image = prepare_image(img_path, yolo) height, width = orig_image.shape[:2] prediction_time, boxes, scores = yolo.predict(input_image, height, width, float(threshold)) classes = np.argmax(scores, axis=1) if len(scores) > 0 else [] inference_time.append(prediction_time) # 4. save detection result orig_image = draw_boxes(orig_image, boxes, scores, classes, config['model']['labels']) output_path = os.path.join(dirname, os.path.split(img_fname)[-1]) cv2.imwrite(output_path, orig_image) print("{} boxes detected. {} saved.".format(len(boxes), output_path)) n_true_positives += count_true_positives(boxes, true_boxes, classes, true_labels) n_truth += len(true_boxes) n_pred += len(boxes) report = calc_score(n_true_positives, n_truth, n_pred) save_report(config, report, os.path.join(dirname, 'report.txt')) print(report) if len(inference_time)>1: print("Average prediction time: {} ms".format(sum(inference_time[1:])/len(inference_time[1:]))) if __name__ == '__main__': # 1.
extract arguments argparser = argparse.ArgumentParser( description='Run evaluation script') argparser.add_argument( '-c', '--config', help='path to configuration file') argparser.add_argument( '-t', '--threshold', help='detection threshold') argparser.add_argument( '-w', '--weights', help='trained weight files') args = argparser.parse_args() with open(args.config) as config_buffer: config = json.loads(config_buffer.read()) setup_evaluation(config, args.weights, args.threshold) ================================================ FILE: axelerate/infer.py ================================================ import glob import os import argparse import json import cv2 import numpy as np import matplotlib import matplotlib.pyplot as plt import matplotlib.image as mpimg from tensorflow.keras import backend as K from axelerate.networks.yolo.frontend import create_yolo from axelerate.networks.yolo.backend.utils.box import draw_boxes from axelerate.networks.segnet.frontend_segnet import create_segnet from axelerate.networks.segnet.predict import visualize_segmentation from axelerate.networks.classifier.frontend_classifier import get_labels, create_classifier K.clear_session() def show_image(filename): image = mpimg.imread(filename) plt.figure() plt.imshow(image) plt.show(block=False) plt.pause(1) plt.close() print(filename) def prepare_image(img_path, network, input_size): orig_image = cv2.imread(img_path) input_image = cv2.cvtColor(orig_image, cv2.COLOR_BGR2RGB) input_image = cv2.resize(input_image, (input_size[1], input_size[0])) input_image = network.norm(input_image) input_image = np.expand_dims(input_image, 0) return orig_image, input_image def find_imgs(folder): ext_list = ['/**/*.jpg', '/**/*.jpeg', '/**/*.png', '/**/*.JPG', '/**/*.JPEG'] image_files_list = [] image_search = lambda ext : glob.glob(folder + ext, recursive=True) for ext in ext_list: image_files_list.extend(image_search(ext)) return image_files_list def setup_inference(config, weights, threshold = None, folder = None): try: matplotlib.use('TkAgg') except: pass #added for compatibility with < 0.5.7 versions try: input_size = config['model']['input_size'][:] except: input_size = [config['model']['input_size'], config['model']['input_size']] """make directory to save inference results """ dirname = os.path.join(os.path.dirname(weights), 'Inference_results') if os.path.isdir(dirname): print("Folder {} already exists. Image files in the directory might be overwritten".format(dirname)) else: print("Folder {} is created.".format(dirname)) os.makedirs(dirname) if config['model']['type']=='Classifier': print('Classifier') if config['model']['labels']: labels = config['model']['labels'] else: labels = get_labels(config['train']['train_image_folder']) # 1. Construct the model classifier = create_classifier(config['model']['architecture'], labels, input_size, config['model']['fully-connected'], config['model']['dropout']) # 2.
Load the trained weights classifier.load_weights(weights) font = cv2.FONT_HERSHEY_SIMPLEX background_color = (70, 120, 70) # grayish green background for text text_color = (255, 255, 255) # white text file_folder = folder if folder else config['train']['valid_image_folder'] image_files_list = find_imgs(file_folder) inference_time = [] for filepath in image_files_list: output_path = os.path.join(dirname, os.path.basename(filepath)) orig_image, input_image = prepare_image(filepath, classifier, input_size) prediction_time, prob, img_class = classifier.predict(input_image) inference_time.append(prediction_time) text = "{}:{:.2f}".format(img_class, prob) # label shape and colorization size = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)[0] left = 10 top = 35 - size[1] right = left + size[0] bottom = top + size[1] # set up the colored rectangle background for text cv2.rectangle(orig_image, (left - 1, top - 5),(right + 1, bottom + 1), background_color, -1) # set up text cv2.putText(orig_image, text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.5, text_color, 1) cv2.imwrite(output_path, orig_image) show_image(output_path) print("{}:{}".format(img_class, prob)) if len(inference_time)>1: print("Average prediction time: {} ms".format(sum(inference_time[1:])/len(inference_time[1:]))) if config['model']['type']=='SegNet': print('Segmentation') # 1. Construct the model segnet = create_segnet(config['model']['architecture'], input_size, config['model']['n_classes']) # 2. Load the trained weights segnet.load_weights(weights) file_folder = folder if folder else config['train']['valid_image_folder'] image_files_list = find_imgs(file_folder) inference_time = [] for filepath in image_files_list: orig_image, input_image = prepare_image(filepath, segnet, input_size) out_fname = os.path.join(dirname, os.path.basename(filepath)) prediction_time, output_array = segnet.predict(input_image) seg_img = visualize_segmentation(output_array, orig_image, segnet.n_classes, overlay_img = True) cv2.imwrite(out_fname, seg_img) show_image(out_fname) if config['model']['type']=='Detector': # 2. create yolo instance & predict yolo = create_yolo(config['model']['architecture'], config['model']['labels'], input_size, config['model']['anchors'], config['model']['obj_thresh'], config['model']['iou_thresh'], config['model']['coord_scale'], config['model']['object_scale'], config['model']['no_object_scale'], config['weights']['backend']) yolo.load_weights(weights) file_folder = folder if folder else config['train']['valid_image_folder'] threshold = threshold if threshold else config['model']['obj_thresh'] image_files_list = find_imgs(file_folder) inference_time = [] for filepath in image_files_list: img_fname = os.path.basename(filepath) orig_image, input_image = prepare_image(filepath, yolo, input_size) height, width = orig_image.shape[:2] prediction_time, boxes, scores = yolo.predict(input_image, height, width, float(threshold)) classes = np.argmax(scores, axis=1) if len(scores) > 0 else [] print(classes) inference_time.append(prediction_time) # 4. save detection result orig_image = draw_boxes(orig_image, boxes, scores, classes, config['model']['labels']) output_path = os.path.join(dirname, os.path.basename(filepath)) cv2.imwrite(output_path, orig_image) print("{} boxes detected. {} saved.".format(len(boxes), output_path)) show_image(output_path) if len(inference_time)>1: print("Average prediction time: {} ms".format(sum(inference_time[1:])/len(inference_time[1:]))) if __name__ == '__main__': # 1.
extract arguments argparser = argparse.ArgumentParser( description='Run inference script') argparser.add_argument( '-c', '--config', help='path to configuration file') argparser.add_argument( '-t', '--threshold', help='detection threshold') argparser.add_argument( '-w', '--weights', help='trained weight files') argparser.add_argument( '-f', '--folder', help='folder with image files to run inference on') args = argparser.parse_args() with open(args.config) as config_buffer: config = json.loads(config_buffer.read()) setup_inference(config, args.weights, args.threshold, args.folder) ================================================ FILE: axelerate/networks/__init__.py ================================================ ================================================ FILE: axelerate/networks/classifier/__init__.py ================================================ ================================================ FILE: axelerate/networks/classifier/batch_gen.py ================================================ ## Code heavily adapted from: ## *https://github.com/keras-team/keras-preprocessing/blob/master/keras_preprocessing/ """Utilities for real-time data augmentation on image data. """ from .directory_iterator import DirectoryIterator from axelerate.networks.common_utils.augment import process_image_classification from tensorflow.keras.utils import Sequence import cv2 import os def create_datagen(img_folder, batch_size, input_size, project_folder, augment, norm): datagen = ImageDataAugmentor(preprocess_input = norm, process_image = process_image_classification, augment = augment) generator = datagen.flow_from_directory(img_folder, target_size = input_size, color_mode = 'rgb', batch_size = batch_size, class_mode = 'categorical', shuffle = augment) if project_folder: labels = (generator.class_indices) labels = dict((v,k) for k,v in labels.items()) fo = open(os.path.join(project_folder,"labels.txt"), "w") for k,v in labels.items(): print(v) fo.write(v+"\n") fo.close() return generator class ImageDataAugmentor(Sequence): """Generate batches of tensor image data with real-time data augmentation. The data will be looped over (in batches). # Arguments preprocess_input: function that will be applied to each input. The function will run after the image is resized and augmented. The function should take one argument: one image, and should output a Numpy tensor with the same shape. augment: augmentations passed as albumentations or imgaug transformation or sequence of transformations. data_format: Image data format, either "channels_first" or "channels_last". "channels_last" mode means that the images should have shape `(samples, height, width, channels)`, "channels_first" mode means that the images should have shape `(samples, channels, height, width)`. It defaults to the `image_data_format` value found in your Keras config file at `~/.keras/keras.json`. If you never set it, then it will be "channels_last". """ def __init__(self, augment = False, process_image=None, preprocess_input=None, data_format='channels_last'): self.augment = augment self.process_image = process_image self.preprocess_input = preprocess_input if data_format not in {'channels_last', 'channels_first'}: raise ValueError( '`data_format` should be `"channels_last"` ' '(channel after row and column) or ' '`"channels_first"` (channel before row and column). 
' 'Received: %s' % data_format) self.data_format = data_format if data_format == 'channels_first': self.channel_axis = 1 self.row_axis = 2 self.col_axis = 3 if data_format == 'channels_last': self.channel_axis = 3 self.row_axis = 1 self.col_axis = 2 def flow_from_directory(self, directory, target_size=(256, 256), color_mode='rgb', classes=None, class_mode='categorical', batch_size=32, shuffle=True, seed=None, save_to_dir=None, save_prefix='', save_format='png', follow_links=False, subset=None, interpolation=cv2.INTER_NEAREST): """Takes the path to a directory & generates batches of augmented data. # Arguments directory: string, path to the target directory. It should contain one subdirectory per class. Any PNG, JPG, BMP, PPM or TIF images inside each of the subdirectories in the directory tree will be included in the generator. See [this script]( https://gist.github.com/fchollet/0830affa1f7f19fd47b06d4cf89ed44d) for more details. target_size: Tuple of integers `(height, width)`, default: `(256, 256)`. The dimensions to which all images found will be resized. color_mode: One of "gray", "rgb", "rgba". Default: "rgb". Whether the images will be converted to have 1, 3, or 4 channels. classes: Optional list of class subdirectories (e.g. `['dogs', 'cats']`). Default: None. If not provided, the list of classes will be automatically inferred from the subdirectory names/structure under `directory`, where each subdirectory will be treated as a different class (and the order of the classes, which will map to the label indices, will be alphanumeric). The dictionary containing the mapping from class names to class indices can be obtained via the attribute `class_indices`. class_mode: One of "categorical", "binary", "sparse", "input", or None. Default: "categorical". Determines the type of label arrays that are returned: - "categorical" will be 2D one-hot encoded labels, - "binary" will be 1D binary labels, "sparse" will be 1D integer labels, - "input" will be images identical to input images (mainly used to work with autoencoders). - If None, no labels are returned (the generator will only yield batches of image data, which is useful to use with `model.predict_generator()`). Please note that in case of class_mode None, the data still needs to reside in a subdirectory of `directory` for it to work correctly. batch_size: Size of the batches of data (default: 32). shuffle: Whether to shuffle the data (default: True) If set to False, sorts the data in alphanumeric order. seed: Optional random seed for shuffling and transformations. save_to_dir: None or str (default: None). This allows you to optionally specify a directory to which to save the augmented pictures being generated (useful for visualizing what you are doing). save_prefix: Str. Prefix to use for filenames of saved pictures (only relevant if `save_to_dir` is set). save_format: One of "png", "jpeg" (only relevant if `save_to_dir` is set). Default: "png". follow_links: Whether to follow symlinks inside class subdirectories (default: False). subset: Subset of data (`"training"` or `"validation"`) if `validation_split` is set in `ImageDataAugmentor`. interpolation: Interpolation method used to resample the image if the target size is different from that of the loaded image. Supported methods are `cv2.INTER_NEAREST`, `cv2.INTER_LINEAR`, `cv2.INTER_AREA`, `cv2.INTER_CUBIC` and `cv2.INTER_LANCZOS4`. By default, `cv2.INTER_NEAREST` is used.
# Returns A `DirectoryIterator` yielding tuples of `(x, y)` where `x` is a numpy array containing a batch of images with shape `(batch_size, *target_size, channels)` and `y` is a numpy array of corresponding labels. """ return DirectoryIterator( directory, self, target_size=target_size, color_mode=color_mode, classes=classes, class_mode=class_mode, data_format=self.data_format, batch_size=batch_size, shuffle=shuffle, seed=seed, save_to_dir=save_to_dir, save_prefix=save_prefix, save_format=save_format, follow_links=follow_links, subset=subset, interpolation=interpolation ) def transform_image(self, image, desired_w, desired_h): """ Transforms an image by first augmenting and then standardizing """ image = self.process_image(image, desired_w, desired_h, self.augment) image = self.preprocess_input(image) return image ================================================ FILE: axelerate/networks/classifier/directory_iterator.py ================================================ """Utilities for real-time data augmentation on image data. """ from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import multiprocessing.pool from six.moves import range import numpy as np import cv2 from .iterator import BatchFromFilesMixin, Iterator from .utils import _list_valid_filenames_in_directory class DirectoryIterator(BatchFromFilesMixin, Iterator): """Iterator capable of reading images from a directory on disk. # Arguments directory: string, path to the directory to read images from. Each subdirectory in this directory will be considered to contain images from one class, or alternatively you could specify class subdirectories via the `classes` argument. image_data_generator: Instance of `ImageDataAugmentor` to use for random transformations and normalization. target_size: tuple of integers, dimensions to resize input images to. color_mode: One of `"rgb"`, `"rgba"`, `"gray"`. Color mode to read images. classes: Optional list of strings, names of subdirectories containing images from each class (e.g. `["dogs", "cats"]`). It will be computed automatically if not set. class_mode: Mode for yielding the targets: `"binary"`: binary targets (if there are only two classes), `"categorical"`: categorical targets, `"sparse"`: integer targets, `"input"`: targets are images identical to input images (mainly used to work with autoencoders), `None`: no targets get yielded (only input images are yielded). batch_size: Integer, size of a batch. shuffle: Boolean, whether to shuffle the data between epochs. If set to False, sorts the data in alphanumeric order. seed: Random seed for data shuffling. data_format: String, one of `channels_first`, `channels_last`. save_to_dir: Optional directory where to save the pictures being yielded, in a viewable format. This is useful for visualizing the random transformations being applied, for debugging purposes. save_prefix: String prefix to use for saving sample images (if `save_to_dir` is set). save_format: Format to use for saving sample images (if `save_to_dir` is set). follow_links: boolean,follow symbolic links to subdirectories subset: Subset of data (`"training"` or `"validation"`) if validation_split is set in ImageDataAugmentor. interpolation: Interpolation method used to resample the image if the target size is different from that of the loaded image. 
Supported methods are `"cv2.INTER_NEAREST"`, `"cv2.INTER_LINEAR"`, `"cv2.INTER_AREA"`, `"cv2.INTER_CUBIC"` and `"cv2.INTER_LANCZOS4"` By default, `"cv2.INTER_NEAREST"` is used. dtype: Dtype to use for generated arrays. """ allowed_class_modes = {'categorical', 'binary', 'sparse', 'input', None} def __init__(self, directory, image_data_generator, target_size=(256, 256), color_mode='rgb', classes=None, class_mode='categorical', batch_size=32, shuffle=True, seed=None, data_format='channels_last', save_to_dir=None, save_prefix='', save_format='png', follow_links=False, subset=None, interpolation=cv2.INTER_NEAREST, dtype='float32'): super(DirectoryIterator, self).set_processing_attrs(image_data_generator, target_size, color_mode, data_format, save_to_dir, save_prefix, save_format, subset, interpolation) self.directory = directory self.classes = classes if class_mode not in self.allowed_class_modes: raise ValueError('Invalid class_mode: {}; expected one of: {}' .format(class_mode, self.allowed_class_modes)) self.class_mode = class_mode self.dtype = dtype # First, count the number of samples and classes. self.samples = 0 if not classes: classes = [] for subdir in sorted(os.listdir(directory)): if os.path.isdir(os.path.join(directory, subdir)): classes.append(subdir) self.num_classes = len(classes) self.class_indices = dict(zip(classes, range(len(classes)))) pool = multiprocessing.pool.ThreadPool() # Second, build an index of the images # in the different class subfolders. results = [] self.filenames = [] i = 0 for dirpath in (os.path.join(directory, subdir) for subdir in classes): results.append( pool.apply_async(_list_valid_filenames_in_directory, (dirpath, self.white_list_formats, self.split, self.class_indices, follow_links))) classes_list = [] for res in results: classes, filenames = res.get() classes_list.append(classes) self.filenames += filenames self.samples = len(self.filenames) self.classes = np.zeros((self.samples,), dtype='int32') for classes in classes_list: self.classes[i:i + len(classes)] = classes i += len(classes) print('Found %d images belonging to %d classes.' 
% (self.samples, self.num_classes)) pool.close() pool.join() self._filepaths = [ os.path.join(self.directory, fname) for fname in self.filenames ] super(DirectoryIterator, self).__init__(self.samples, batch_size, shuffle, seed) @property def filepaths(self): return self._filepaths @property def labels(self): return self.classes @property # mixin needs this property to work def sample_weight(self): # no sample weights will be returned return None ================================================ FILE: axelerate/networks/classifier/frontend_classifier.py ================================================ import time import os import numpy as np import matplotlib.pyplot as plt from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay from axelerate.networks.common_utils.feature import create_feature_extractor from axelerate.networks.classifier.batch_gen import create_datagen from axelerate.networks.common_utils.fit import train from tensorflow.keras.models import Model, load_model from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout from tensorflow.keras.applications.mobilenet import preprocess_input def get_labels(directory): labels = sorted(os.listdir(directory)) return labels def create_classifier(architecture, labels, input_size, layers, dropout, weights = None, save_bottleneck = False): base_model = create_feature_extractor(architecture, input_size, weights) x = base_model.feature_extractor.outputs[0] x = GlobalAveragePooling2D()(x) if len(layers) != 0: for layer in layers[0:-1]: x = Dense(layer, activation = 'relu')(x) x = Dropout(dropout)(x) x = Dense(layers[-1], activation = 'relu')(x) preds = Dense(len(labels), activation = 'softmax')(x) model = Model(inputs = base_model.feature_extractor.inputs[0],outputs = preds, name = 'classifier') bottleneck_layer = None if save_bottleneck: bottleneck_layer = base_model.feature_extractor.layers[-1].name network = Classifier(model, input_size, labels, base_model.normalize, bottleneck_layer) return network class Classifier(object): def __init__(self, network, input_size, labels, norm, bottleneck_layer): self.network = network self.labels = labels self.input_size = input_size self.bottleneck_layer = bottleneck_layer self.norm = norm def load_weights(self, weight_path, by_name=False): if os.path.exists(weight_path): print("Loading pre-trained weights for the whole model: ", weight_path) self.network.load_weights(weight_path) else: print("Failed to load pre-trained weights for the whole model. 
It might be because you didn't specify any, or because the weight file could not be found") def save_bottleneck(self, model_path, bottleneck_layer): bottleneck_weights_path = os.path.join(os.path.dirname(model_path),'bottleneck_weights.h5') model = load_model(model_path) for layer in model.layers: if layer.name == bottleneck_layer: output = layer.output bottleneck_model = Model(model.input, output) bottleneck_model.save_weights(bottleneck_weights_path) def predict(self, img): start_time = time.time() Y_pred = np.squeeze(self.network(img, training = False)) elapsed_ms = (time.time() - start_time) * 1000 y_pred = np.argmax(Y_pred) prob = Y_pred[y_pred] prediction = self.labels[y_pred] return elapsed_ms, prob, prediction def evaluate(self, img_folder, batch_size): self.generator = create_datagen(img_folder, batch_size, self.input_size, None, False, self.norm) Y_pred = self.network.predict(self.generator, len(self.generator) // batch_size + 1) y_pred = np.argmax(Y_pred, axis=1) print('Classification Report') report = classification_report(self.generator.classes, y_pred, target_names = self.labels) print(report) print('Confusion Matrix') cm = confusion_matrix(self.generator.classes, y_pred) disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels = self.labels) disp.plot(include_values=True, cmap='Blues', ax=None) plt.show() return report, cm def train(self, img_folder, nb_epoch, project_folder, batch_size = 8, augumentation = False, learning_rate = 1e-4, train_times = 1, valid_times = 1, valid_img_folder = "", first_trainable_layer = None, metrics = "val_loss"): if metrics != "accuracy" and metrics != "loss": print("Unknown metric for Classifier, valid options are: accuracy or loss. Defaulting to loss") metrics = "loss" train_generator = create_datagen(img_folder, batch_size, self.input_size, project_folder, augumentation, self.norm) validation_generator = create_datagen(valid_img_folder, batch_size, self.input_size, project_folder, False, self.norm) model_layers, model_path = train(self.network, 'categorical_crossentropy', train_generator, validation_generator, learning_rate, nb_epoch, project_folder, first_trainable_layer, metric_name = metrics) if self.bottleneck_layer: self.save_bottleneck(model_path, self.bottleneck_layer) return model_layers, model_path ================================================ FILE: axelerate/networks/classifier/iterator.py ================================================ """Utilities for real-time data augmentation on image data. """ from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import threading import numpy as np from keras_preprocessing import get_keras_submodule import matplotlib.pyplot as plt try: IteratorType = get_keras_submodule('utils').Sequence except ImportError: IteratorType = object from .utils import (array_to_img, img_to_array, load_img) class Iterator(IteratorType): """Base class for image data iterators. Every `Iterator` must implement the `_get_batches_of_transformed_samples` method. # Arguments n: Integer, total number of samples in the dataset to loop over. batch_size: Integer, size of a batch. shuffle: Boolean, whether to shuffle the data between epochs. seed: Random seeding for data shuffling. 
""" white_list_formats = ('png', 'jpg', 'jpeg', 'bmp', 'ppm', 'tif', 'tiff') def __init__(self, n, batch_size, shuffle, seed): self.n = n self.batch_size = batch_size self.seed = seed self.shuffle = shuffle self.batch_index = 0 self.total_batches_seen = 0 self.lock = threading.Lock() self.index_array = None self.index_generator = self._flow_index() def _set_index_array(self): self.index_array = np.arange(self.n) if self.shuffle: self.index_array = np.random.permutation(self.n) def __getitem__(self, idx): if idx >= len(self): raise ValueError('Asked to retrieve element {idx}, ' 'but the Sequence ' 'has length {length}'.format(idx=idx, length=len(self))) if self.seed is not None: np.random.seed(self.seed + self.total_batches_seen) self.total_batches_seen += 1 if self.index_array is None: self._set_index_array() index_array = self.index_array[self.batch_size * idx: self.batch_size * (idx + 1)] return self._get_batches_of_transformed_samples(index_array) def __len__(self): return (self.n + self.batch_size - 1) // self.batch_size # round up def on_epoch_end(self): self._set_index_array() def reset(self): self.batch_index = 0 def _flow_index(self): # Ensure self.batch_index is 0. self.reset() while 1: if self.seed is not None: np.random.seed(self.seed + self.total_batches_seen) if self.batch_index == 0: self._set_index_array() if self.n == 0: # Avoiding modulo by zero error current_index = 0 else: current_index = (self.batch_index * self.batch_size) % self.n if self.n > current_index + self.batch_size: self.batch_index += 1 else: self.batch_index = 0 self.total_batches_seen += 1 yield self.index_array[current_index: current_index + self.batch_size] def __iter__(self): # Needed if we want to do something like: # for x, y in data_gen.flow(...): return self def __next__(self, *args, **kwargs): return self.next(*args, **kwargs) def next(self): """For python 2.x. # Returns The next batch. """ with self.lock: index_array = next(self.index_generator) # The transformation of images is not under thread lock # so it can be done in parallel return self._get_batches_of_transformed_samples(index_array) def _get_batches_of_transformed_samples(self, index_array): """Gets a batch of transformed samples. # Arguments index_array: Array of sample indices to include in batch. # Returns A batch of transformed samples. """ raise NotImplementedError class BatchFromFilesMixin(): """Adds methods related to getting batches from filenames It includes the logic to transform image files to batches. """ def set_processing_attrs(self, image_data_generator, target_size, color_mode, data_format, save_to_dir, save_prefix, save_format, subset, interpolation): """Sets attributes to use later for processing files into a batch. # Arguments image_data_generator: Instance of `ImageDataAugmentor` to use for random transformations and normalization. target_size: tuple of integers, dimensions to resize input images to. color_mode: One of `"rgb"`, `"rgba"`, `"gray"`. Color mode to read images. data_format: String, one of `channels_first`, `channels_last`. save_to_dir: Optional directory where to save the pictures being yielded, in a viewable format. This is useful for visualizing the random transformations being applied, for debugging purposes. save_prefix: String prefix to use for saving sample images (if `save_to_dir` is set). save_format: Format to use for saving sample images (if `save_to_dir` is set). subset: Subset of data (`"training"` or `"validation"`) if validation_split is set in ImageDataAugmentor. 
interpolation: Interpolation method used to resample the image if the target size is different from that of the loaded image. Supported methods are `"cv2.INTER_NEAREST"`, `"cv2.INTER_LINEAR"`, `"cv2.INTER_AREA"`, `"cv2.INTER_CUBIC"` and `"cv2.INTER_LANCZOS4"` By default, `"cv2.INTER_NEAREST"` is used. """ self.image_data_generator = image_data_generator self.target_size = tuple(target_size) if color_mode not in {'rgb', 'rgba', 'gray'}: raise ValueError('Invalid color mode:', color_mode, '; expected "rgb", "rgba", or "gray".') self.color_mode = color_mode self.data_format = data_format if self.color_mode == 'rgba': if self.data_format == 'channels_last': self.image_shape = self.target_size + (4,) else: self.image_shape = (4,) + self.target_size elif self.color_mode == 'rgb': if self.data_format == 'channels_last': self.image_shape = self.target_size + (3,) else: self.image_shape = (3,) + self.target_size else: if self.data_format == 'channels_last': self.image_shape = self.target_size + (1,) else: self.image_shape = (1,) + self.target_size self.save_to_dir = save_to_dir self.save_prefix = save_prefix self.save_format = save_format self.interpolation = interpolation if subset is not None: validation_split = self.image_data_generator._validation_split if subset == 'validation': split = (0, validation_split) elif subset == 'training': split = (validation_split, 1) else: raise ValueError( 'Invalid subset name: %s;' 'expected "training" or "validation"' % (subset,)) else: split = None self.split = split self.subset = subset def _get_batch_of_samples(self, index_array, apply_standardization=True): """Gets a batch of transformed samples. # Arguments index_array: Array of sample indices to include in batch. # Returns A batch of transformed samples. """ # build batch of image data # self.filepaths is dynamic, is better to call it once outside the loop filepaths = self.filepaths # build batch of image data batch_x = np.array([load_img(filepaths[x], color_mode=self.color_mode, target_size=self.target_size, interpolation=self.interpolation) for x in index_array]) # apply the augmentations and custom transformations to the image data batch_x = np.array([self.image_data_generator.transform_image(x, self.target_size[0], self.target_size[1]) for x in batch_x]) # transform to `channels_first` format if needed if self.data_format == "channels_first": batch_x = np.array([np.swapaxes(x,0,2) for x in batch_x]) # optionally save augmented images to disk for debugging purposes if self.save_to_dir: for i, j in enumerate(index_array): img = array_to_img(batch_x[i], self.data_format, scale=True) fname = '{prefix}_{index}_{hash}.{format}'.format( prefix=self.save_prefix, index=j, hash=np.random.randint(1e7), format=self.save_format) img.save(os.path.join(self.save_to_dir, fname)) # build batch of labels if self.class_mode == 'input': batch_y = batch_x.copy() elif self.class_mode in {'binary', 'sparse'}: batch_y = np.empty(len(batch_x), dtype=self.dtype) for i, n_observation in enumerate(index_array): batch_y[i] = self.classes[n_observation] elif self.class_mode == 'categorical': batch_y = np.zeros((len(batch_x), len(self.class_indices)), dtype=self.dtype) for i, n_observation in enumerate(index_array): batch_y[i, self.classes[n_observation]] = 1. 
elif self.class_mode == 'multi_output': batch_y = [output[index_array] for output in self.labels] elif self.class_mode == 'raw': batch_y = self.labels[index_array] else: return batch_x if self.sample_weight is None: return batch_x, batch_y else: return batch_x, batch_y, self.sample_weight[index_array] def _get_batches_of_transformed_samples(self, index_array): return self._get_batch_of_samples(index_array) def show_batch(self, rows:int=5, apply_standardization:bool=False, **plt_kwargs): img_arr = np.random.choice(range(len(self.classes)), rows**2) if self.class_mode is None: imgs = self._get_batch_of_samples(img_arr, apply_standardization=apply_standardization) else: imgs, _ = self._get_batch_of_samples(img_arr, apply_standardization=apply_standardization) lbls = np.array(self.labels)[img_arr] try: inv_class_indices = {v: k for k, v in self.class_indices.items()} lbls = [inv_class_indices.get(k) for k in lbls] except: pass if self.data_format == "channels_first": imgs = np.array([np.swapaxes(img,0,2) for img in imgs]) if not 'figsize' in plt_kwargs: plt_kwargs['figsize'] = (12,12) plt.close('all') plt.figure(**plt_kwargs) for idx, img in enumerate(imgs): plt.subplot(rows, rows, idx+1) plt.imshow(img.squeeze()) if lbls is not None: plt.title(lbls[idx]) plt.axis('off') plt.subplots_adjust(hspace=0.5, wspace=0.5) plt.show() @property def filepaths(self): """List of absolute paths to image files""" raise NotImplementedError( '`filepaths` property method has not been implemented in {}.' .format(type(self).__name__) ) @property def labels(self): """Class labels of every observation""" raise NotImplementedError( '`labels` property method has not been implemented in {}.' .format(type(self).__name__) ) @property def sample_weight(self): raise NotImplementedError( '`sample_weight` property method has not been implemented in {}.' .format(type(self).__name__) ) ================================================ FILE: axelerate/networks/classifier/utils.py ================================================ """Utilities for real-time data augmentation on image data. """ from __future__ import absolute_import from __future__ import division from __future__ import print_function import os import warnings import numpy as np import cv2 try: from PIL import ImageEnhance from PIL import Image as pil_image except ImportError: pil_image = None ImageEnhance = None if pil_image is not None: _PIL_INTERPOLATION_METHODS = { 'nearest': pil_image.NEAREST, 'bilinear': pil_image.BILINEAR, 'bicubic': pil_image.BICUBIC, } # These methods were only introduced in version 3.4.0 (2016). if hasattr(pil_image, 'HAMMING'): _PIL_INTERPOLATION_METHODS['hamming'] = pil_image.HAMMING if hasattr(pil_image, 'BOX'): _PIL_INTERPOLATION_METHODS['box'] = pil_image.BOX # This method is new in version 1.1.3 (2013). if hasattr(pil_image, 'LANCZOS'): _PIL_INTERPOLATION_METHODS['lanczos'] = pil_image.LANCZOS def validate_filename(filename, white_list_formats): """Check if a filename refers to a valid file. # Arguments filename: String, absolute path to a file white_list_formats: Set, allowed file extensions # Returns A boolean value indicating if the filename is valid or not """ return (filename.lower().endswith(white_list_formats) and os.path.isfile(filename)) def save_img(path, x, data_format='channels_last', file_format=None, scale=True, **kwargs): """Saves an image stored as a Numpy array to a path or file object. # Arguments path: Path or file object. x: Numpy array. data_format: Image data format, either "channels_first" or "channels_last". 
file_format: Optional file format override. If omitted, the format to use is determined from the filename extension. If a file object was used instead of a filename, this parameter should always be used. scale: Whether to rescale image values to be within `[0, 255]`. **kwargs: Additional keyword arguments passed to `PIL.Image.save()`. """ img = array_to_img(x, data_format=data_format, scale=scale) if img.mode == 'RGBA' and (file_format == 'jpg' or file_format == 'jpeg'): warnings.warn('The JPG format does not support ' 'RGBA images, converting to RGB.') img = img.convert('RGB') img.save(path, format=file_format, **kwargs) def load_img(fname, color_mode='rgb', target_size=None, interpolation=cv2.INTER_NEAREST): if color_mode == "rgb": img = cv2.imread(fname) img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) elif color_mode == "rgba": img = cv2.imread(fname,-1) if img.shape[-1]!=4: #Add alpha-channel if not RGBA img = cv2.cvtColor(img, cv2.COLOR_BGR2RGBA) elif color_mode == "gray": img = cv2.imread(fname, 0) else: img = cv2.imread(fname) if target_size is not None: width_height_tuple = (target_size[1], target_size[0]) if img.shape[0:2] != width_height_tuple: img = cv2.resize(img, dsize=width_height_tuple, interpolation = interpolation) if color_mode == "gray": return img[..., np.newaxis] #Add dummy axis. This is done here because `cv2.resize` removes the dummy axis else: return img def list_pictures(directory, ext=('jpg', 'jpeg', 'bmp', 'png', 'ppm', 'tif', 'tiff')): """Lists all pictures in a directory, including all subdirectories. # Arguments directory: string, absolute path to the directory ext: tuple of strings or single string, extensions of the pictures # Returns a list of paths """ ext = tuple('.%s' % e for e in ((ext,) if isinstance(ext, str) else ext)) return [os.path.join(root, f) for root, _, files in os.walk(directory) for f in files if f.lower().endswith(ext)] def _iter_valid_files(directory, white_list_formats, follow_links): """Iterates over files with extension in `white_list_formats` contained in `directory`. # Arguments directory: Absolute path to the directory containing files to be counted white_list_formats: Set of strings containing allowed extensions for the files to be counted. follow_links: Boolean, follow symbolic links to subdirectories. # Yields Tuple of (root, filename) with extension in `white_list_formats`. """ def _recursive_list(subpath): return sorted(os.walk(subpath, followlinks=follow_links), key=lambda x: x[0]) for root, _, files in _recursive_list(directory): for fname in sorted(files): if fname.lower().endswith('.tiff'): warnings.warn('Using ".tiff" files with multiple bands ' 'will cause distortion. Please verify your output.') if fname.lower().endswith(white_list_formats): yield root, fname def _list_valid_filenames_in_directory(directory, white_list_formats, split, class_indices, follow_links): """Lists paths of files in `directory` with extensions in `white_list_formats`. # Arguments directory: absolute path to a directory containing the files to list. The directory name is used as class label and must be a key of `class_indices`. white_list_formats: set of strings containing allowed extensions for the files to be counted. split: tuple of floats (e.g. `(0.2, 0.6)`) to only take into account a certain fraction of files in each directory. E.g.: `split=(0.6, 1.0)` would only account for the last 40 percent of images in each directory. class_indices: dictionary mapping a class name to its index. follow_links: boolean, follow symbolic links to subdirectories.
# Returns classes: a list of class indices filenames: the path of valid files in `directory`, relative from `directory`'s parent (e.g., if `directory` is "dataset/class1", the filenames will be `["class1/file1.jpg", "class1/file2.jpg", ...]`). """ dirname = os.path.basename(directory) if split: num_files = len(list( _iter_valid_files(directory, white_list_formats, follow_links))) start, stop = int(split[0] * num_files), int(split[1] * num_files) valid_files = list( _iter_valid_files( directory, white_list_formats, follow_links))[start: stop] else: valid_files = _iter_valid_files( directory, white_list_formats, follow_links) classes = [] filenames = [] for root, fname in valid_files: classes.append(class_indices[dirname]) absolute_path = os.path.join(root, fname) relative_path = os.path.join( dirname, os.path.relpath(absolute_path, directory)) filenames.append(relative_path) return classes, filenames def array_to_img(x, data_format='channels_last', scale=True, dtype='float32'): """Converts a 3D Numpy array to a PIL Image instance. # Arguments x: Input Numpy array. data_format: Image data format. either "channels_first" or "channels_last". scale: Whether to rescale image values to be within `[0, 255]`. dtype: Dtype to use. # Returns A PIL Image instance. # Raises ImportError: if PIL is not available. ValueError: if invalid `x` or `data_format` is passed. """ if pil_image is None: raise ImportError('Could not import PIL.Image. ' 'The use of `array_to_img` requires PIL.') x = np.asarray(x, dtype=dtype) if x.ndim != 3: raise ValueError('Expected image array to have rank 3 (single image). ' 'Got array with shape: %s' % (x.shape,)) if data_format not in {'channels_first', 'channels_last'}: raise ValueError('Invalid data_format: %s' % data_format) # Original Numpy array x has format (height, width, channel) # or (channel, height, width) # but target PIL image has format (width, height, channel) if data_format == 'channels_first': x = x.transpose(1, 2, 0) if scale: x = x + max(-np.min(x), 0) x_max = np.max(x) if x_max != 0: x /= x_max x *= 255 if x.shape[2] == 4: # RGBA return pil_image.fromarray(x.astype('uint8'), 'RGBA') elif x.shape[2] == 3: # RGB return pil_image.fromarray(x.astype('uint8'), 'RGB') elif x.shape[2] == 1: # grayscale return pil_image.fromarray(x[:, :, 0].astype('uint8'), 'L') else: raise ValueError('Unsupported channel number: %s' % (x.shape[2],)) def img_to_array(img, data_format='channels_last', dtype='float32'): """Converts a PIL Image instance to a Numpy array. # Arguments img: PIL Image instance. data_format: Image data format, either "channels_first" or "channels_last". dtype: Dtype to use for the returned array. # Returns A 3D Numpy array. # Raises ValueError: if invalid `img` or `data_format` is passed. 
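# Example A minimal illustration (the zero-filled 100x100 RGB array below is arbitrary sample data, not something this module provides):
>>> import numpy as np
>>> img = pil_image.fromarray(np.zeros((100, 100, 3), dtype='uint8'))
>>> img_to_array(img).shape
(100, 100, 3)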
""" if data_format not in {'channels_first', 'channels_last'}: raise ValueError('Unknown data_format: %s' % data_format) # Numpy array x has format (height, width, channel) # or (channel, height, width) # but original PIL image has format (width, height, channel) x = np.asarray(img, dtype=dtype) if len(x.shape) == 3: if data_format == 'channels_first': x = x.transpose(2, 0, 1) elif len(x.shape) == 2: if data_format == 'channels_first': x = x.reshape((1, x.shape[0], x.shape[1])) else: x = x.reshape((x.shape[0], x.shape[1], 1)) else: raise ValueError('Unsupported image shape: %s' % (x.shape,)) return x ================================================ FILE: axelerate/networks/common_utils/__init__.py ================================================ ================================================ FILE: axelerate/networks/common_utils/augment.py ================================================ # -*- coding: utf-8 -*- import numpy as np np.random.seed(1337) import imgaug as ia from imgaug import augmenters as iaa from imgaug.augmentables.segmaps import SegmentationMapsOnImage from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage import cv2 import os import glob import random class ImgAugment(object): def __init__(self, w, h, jitter): """ # Args desired_w : int desired_h : int jitter : bool """ self._jitter = jitter self._w = w self._h = h def imread(self, img_file, boxes, labels): """ # Args img_file : str boxes : array, shape of (N, 4) # Returns image : 3d-array, shape of (h, w, 3) boxes_ : array, same shape of boxes jittered & resized bounding box """ # 1. read image file try: image = cv2.imread(img_file) image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) except: print("This image has an annotation file, but cannot be open. Check the integrity of your dataset.", img_file) raise boxes_ = np.copy(boxes) labels_ = np.copy(labels) # 2. 
resize and augment image image, boxes_, labels_ = process_image_detection(image, boxes_, labels_, self._w, self._h, self._jitter) return image, boxes_, labels_ def _to_bbs(boxes, labels, shape): new_boxes = [] for i in range(len(boxes)): x1,y1,x2,y2 = boxes[i] new_box = BoundingBox(x1,y1,x2,y2, labels[i]) new_boxes.append(new_box) bbs = BoundingBoxesOnImage(new_boxes, shape) return bbs def _to_array(bbs): new_boxes = [] new_labels = [] for bb in bbs.bounding_boxes: x1 = int(bb.x1) x2 = int(bb.x2) y1 = int(bb.y1) y2 = int(bb.y2) label = bb.label new_boxes.append([x1,y1,x2,y2]) new_labels.append(label) return new_boxes, new_labels def process_image_detection(image, boxes, labels, desired_w, desired_h, augment): # resize the image to standard size if (desired_w and desired_h) or augment: bbs = _to_bbs(boxes, labels, image.shape) if (desired_w and desired_h): # Rescale image and bounding boxes image = ia.imresize_single_image(image, (desired_w, desired_h)) bbs = bbs.on(image) if augment: aug_pipe = _create_augment_pipeline() image, bbs = aug_pipe(image=image, bounding_boxes=bbs) bbs = bbs.remove_out_of_image().clip_out_of_image() new_boxes, new_labels = _to_array(bbs) #if len(new_boxes) != len(boxes): # print(new_boxes) # print(boxes) # print("_________________") return image, np.array(new_boxes), new_labels else: return image, np.array(boxes), labels def process_image_classification(image, desired_w, desired_h, augment): # resize the image to standard size if (desired_w and desired_h) or augment: if (desired_w and desired_h): # Rescale image image = ia.imresize_single_image(image, (desired_w, desired_h)) if augment: aug_pipe = _create_augment_pipeline() image = aug_pipe(image=image) return image def process_image_segmentation(image, segmap, input_w, input_h, output_w, output_h, augment): # resize the image to standard size if (input_w and input_h) or augment: segmap = SegmentationMapsOnImage(segmap, shape=image.shape) if (input_w and input_h): # Rescale image and segmaps image = ia.imresize_single_image(image, (input_w, input_h)) segmap = segmap.resize((output_w, output_h), interpolation="nearest") if augment: aug_pipe = _create_augment_pipeline() image, segmap = aug_pipe(image=image, segmentation_maps=segmap) return image, segmap.get_arr() def _create_augment_pipeline(): sometimes = lambda aug: iaa.Sometimes(0.1, aug) aug_pipe = iaa.Sequential( [ iaa.Fliplr(0.5), iaa.Flipud(0.2), iaa.Affine(translate_percent={"x": (-0.1, 0.1), "y": (-0.1, 0.1)}), iaa.OneOf([iaa.Affine(scale=(0.8, 1.2)), iaa.Affine(rotate=(-10, 10)), iaa.Affine(shear=(-10, 10))]), sometimes(iaa.OneOf([ iaa.GaussianBlur((0, 3.0)), iaa.AverageBlur(k=(2, 7)), iaa.MedianBlur(k=(3, 11)), ])), sometimes(iaa.Sharpen(alpha=(0, 1.0), lightness=(0.75, 1.5))), sometimes(iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05 * 255), per_channel=0.5)), sometimes(iaa.OneOf([ iaa.Dropout((0.01, 0.1), per_channel=0.5), iaa.CoarseDropout((0.03, 0.15), size_percent=(0.02, 0.05), per_channel=0.2), ])), sometimes(iaa.Add((-10, 10), per_channel=0.5)), sometimes(iaa.Multiply((0.5, 1.5), per_channel=0.5)), sometimes(iaa.LinearContrast((0.5, 2.0), per_channel=0.5)) ], random_order=True ) return aug_pipe def visualize_detection_dataset(img_folder, ann_folder, num_imgs = None, img_size=None, augment=None): import matplotlib.pyplot as plt import matplotlib from axelerate.networks.yolo.backend.utils.annotation import PascalVocXmlParser try: matplotlib.use('TkAgg') except: pass parser = PascalVocXmlParser() aug = ImgAugment(img_size, img_size, jitter=augment) 
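# For each annotation file below: parse the image filename, labels and boxes, load the image with optional resize/jitter via ImgAugment.imread(), then draw the transformed boxes and labels for visual inspection.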
for ann in os.listdir(ann_folder)[:num_imgs]: annotation_file = os.path.join(ann_folder, ann) fname = parser.get_fname(annotation_file) labels = parser.get_labels(annotation_file) boxes = parser.get_boxes(annotation_file) img_file = os.path.join(img_folder, fname) img, boxes_, labels_ = aug.imread(img_file, boxes, labels) for i in range(len(boxes_)): x1, y1, x2, y2 = boxes_[i] cv2.rectangle(img, (x1,y1), (x2,y2), (0,255,0), 3) cv2.putText(img, '{}'.format(labels_[i]), (x1, y1 - 13), cv2.FONT_HERSHEY_SIMPLEX, 1e-3 * img.shape[0], (255,0,0), 1) plt.imshow(img) plt.show(block=False) plt.pause(1) plt.close() def visualize_segmentation_dataset(images_path, segs_path, num_imgs = None, img_size=None, augment=False, n_classes=255): import matplotlib.pyplot as plt import matplotlib from axelerate.networks.segnet.data_utils.data_loader import get_pairs_from_paths, DATA_LOADER_SEED, class_colors, DataLoaderError try: matplotlib.use('TkAgg') except: pass def _get_colored_segmentation_image(img, seg, colors, n_classes, img_size, do_augment=False): """ Return a colored segmented image """ img, seg = process_image_segmentation(img, seg, img_size, img_size, img_size, img_size, do_augment) seg_img = np.zeros_like(seg) for c in range(n_classes): seg_img[:, :, 0] += ((seg[:, :, 0] == c) * (colors[c][0])).astype('uint8') seg_img[:, :, 1] += ((seg[:, :, 0] == c) * (colors[c][1])).astype('uint8') seg_img[:, :, 2] += ((seg[:, :, 0] == c) * (colors[c][2])).astype('uint8') return img, seg_img try: # Get image-segmentation pairs img_seg_pairs = get_pairs_from_paths(images_path, segs_path, ignore_non_matching=True) # Get the colors for the classes colors = class_colors print("Please press any key to display the next image") for im_fn, seg_fn in img_seg_pairs[:num_imgs]: img = cv2.imread(im_fn)[...,::-1] seg = cv2.imread(seg_fn) print("Found the following classes in the segmentation image:", np.unique(seg)) img, seg_img = _get_colored_segmentation_image(img, seg, colors, n_classes, img_size, do_augment=augment) fig = plt.figure(figsize=(14,7)) ax1 = fig.add_subplot(1,2,1) ax1.imshow(img) ax3 = fig.add_subplot(1,2,2) ax3.imshow(seg_img) plt.show(block=False) plt.pause(1) plt.close() except DataLoaderError as e: print("Found error during data loading\n{0}".format(str(e))) return False def visualize_classification_dataset(img_folder, num_imgs = None, img_size=None, augment=None): import matplotlib.pyplot as plt import matplotlib try: matplotlib.use('TkAgg') except: pass font = cv2.FONT_HERSHEY_SIMPLEX image_files_list = [] image_search = lambda ext : glob.glob(img_folder + ext, recursive=True) for ext in ['/**/*.jpg', '/**/*.jpeg', '/**/*.png']: image_files_list.extend(image_search(ext)) random.shuffle(image_files_list) for filename in image_files_list[0:num_imgs]: image = cv2.imread(filename)[...,::-1] image = process_image_classification(image, img_size, img_size, augment) cv2.putText(image, os.path.dirname(filename).split('/')[-1], (10,30), font, image.shape[1]/700 , (255, 0, 0), 2, True) plt.figure() plt.imshow(image) plt.show(block=False) plt.pause(1) plt.close() print(filename) if __name__ == '__main__': import argparse parser = argparse.ArgumentParser() parser.add_argument("--type", type=str) parser.add_argument("--images", type=str) parser.add_argument("--annotations", type=str) parser.add_argument("--num_imgs", type=int) parser.add_argument("--img_size", type=int) parser.add_argument("--aug", type=bool) args = parser.parse_args() if args.type == 'detection': visualize_detection_dataset(args.images, 
args.annotations, args.num_imgs, args.img_size, args.aug) if args.type == 'segmentation': visualize_segmentation_dataset(args.images, args.annotations, args.num_imgs, args.img_size, args.aug) if args.type == 'classification': visualize_classification_dataset(args.images, args.num_imgs, args.img_size, args.aug) ================================================ FILE: axelerate/networks/common_utils/callbacks.py ================================================ import numpy as np from tensorflow import keras from tensorflow.keras import backend as K def cosine_decay_with_warmup(global_step, learning_rate_base, total_steps, warmup_learning_rate=0.0, warmup_steps=0, hold_base_rate_steps=0): """Cosine decay schedule with warm up period. Cosine annealing learning rate as described in: Loshchilov and Hutter, SGDR: Stochastic Gradient Descent with Warm Restarts. ICLR 2017. https://arxiv.org/abs/1608.03983 In this schedule, the learning rate grows linearly from warmup_learning_rate to learning_rate_base for warmup_steps, then transitions to a cosine decay schedule. Arguments: global_step {int} -- global step. learning_rate_base {float} -- base learning rate. total_steps {int} -- total number of training steps. Keyword Arguments: warmup_learning_rate {float} -- initial learning rate for warm up. (default: {0.0}) warmup_steps {int} -- number of warmup steps. (default: {0}) hold_base_rate_steps {int} -- Optional number of steps to hold base learning rate before decaying. (default: {0}) Returns: a float representing learning rate. Raises: ValueError: if warmup_learning_rate is larger than learning_rate_base, or if warmup_steps is larger than total_steps. """ if total_steps < warmup_steps: raise ValueError('total_steps must be larger or equal to ' 'warmup_steps.') learning_rate = 0.5 * learning_rate_base * (1 + np.cos( np.pi * (global_step - warmup_steps - hold_base_rate_steps ) / float(total_steps - warmup_steps - hold_base_rate_steps))) if hold_base_rate_steps > 0: learning_rate = np.where(global_step > warmup_steps + hold_base_rate_steps, learning_rate, learning_rate_base) if warmup_steps > 0: if learning_rate_base < warmup_learning_rate: raise ValueError('learning_rate_base must be larger or equal to ' 'warmup_learning_rate.') slope = (learning_rate_base - warmup_learning_rate) / warmup_steps warmup_rate = slope * global_step + warmup_learning_rate learning_rate = np.where(global_step < warmup_steps, warmup_rate, learning_rate) return np.where(global_step > total_steps, 0.0, learning_rate) class WarmUpCosineDecayScheduler(keras.callbacks.Callback): """Cosine decay with warmup learning rate scheduler """ def __init__(self, learning_rate_base, total_steps, global_step_init=0, warmup_learning_rate=0.0, warmup_steps=0, hold_base_rate_steps=0, verbose=0): """Constructor for cosine decay with warmup learning rate scheduler. Arguments: learning_rate_base {float} -- base learning rate. total_steps {int} -- total number of training steps. Keyword Arguments: global_step_init {int} -- initial global step, e.g. from previous checkpoint. warmup_learning_rate {float} -- initial learning rate for warm up. (default: {0.0}) warmup_steps {int} -- number of warmup steps. (default: {0}) hold_base_rate_steps {int} -- Optional number of steps to hold base learning rate before decaying. (default: {0}) verbose {int} -- 0: quiet, 1: update messages. 
(default: {0}) """ super(WarmUpCosineDecayScheduler, self).__init__() self.learning_rate_base = learning_rate_base self.total_steps = total_steps self.global_step = global_step_init self.warmup_learning_rate = warmup_learning_rate self.warmup_steps = warmup_steps self.hold_base_rate_steps = hold_base_rate_steps self.verbose = verbose self.learning_rates = [] self.current_lr = 0.0 def on_epoch_end(self, epoch, logs={}): if self.verbose == 1: print('Epoch %05d: Learning rate is %s.\n' % (epoch, self.current_lr)) def on_batch_end(self, batch, logs=None): self.global_step = self.global_step + 1 lr = K.get_value(self.model.optimizer.lr) self.learning_rates.append(lr) def on_batch_begin(self, batch, logs=None): self.current_lr = cosine_decay_with_warmup(global_step=self.global_step, learning_rate_base=self.learning_rate_base, total_steps=self.total_steps, warmup_learning_rate=self.warmup_learning_rate, warmup_steps=self.warmup_steps, hold_base_rate_steps=self.hold_base_rate_steps) K.set_value(self.model.optimizer.lr, self.current_lr) if self.verbose ==2: print('\nBatch %05d: setting learning rate to %s.' % (self.global_step + 1, self.current_lr)) ================================================ FILE: axelerate/networks/common_utils/convert.py ================================================ import tensorflow as tf import tensorflow.keras.backend as k import subprocess import os import cv2 import argparse import tarfile import glob import shutil import numpy as np import shlex k210_converter_path=os.path.join(os.path.dirname(__file__),"ncc","ncc") k210_converter_download_path=os.path.join(os.path.dirname(os.path.abspath(__file__)),'ncc_linux_x86_64.tar.xz') nncase_download_url="https://github.com/kendryte/nncase/releases/download/v0.2.0-beta4/ncc_linux_x86_64.tar.xz" cwd = os.path.dirname(os.path.realpath(__file__)) def run_command(cmd, cwd=None): with subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, executable='/bin/bash', universal_newlines=True, cwd=cwd) as p: while True: line = p.stdout.readline() if not line: break print(line) exit_code = p.poll() return exit_code class Converter(object): def __init__(self, converter_type, backend=None, dataset_path=None): if 'tflite' in converter_type: print('Tflite Converter ready') if 'k210' in converter_type: if os.path.exists(k210_converter_path): print('K210 Converter ready') else: print('Downloading K210 Converter') _path = tf.keras.utils.get_file(k210_converter_download_path, nncase_download_url) print(_path) tar_file = tarfile.open(k210_converter_download_path) tar_file.extractall(os.path.join(os.path.dirname(__file__),"ncc")) tar_file.close() os.chmod(k210_converter_path, 0o775) if 'edgetpu' in converter_type: rc, out = subprocess.getstatusoutput('dpkg -l edgetpu-compiler') if rc == 0: print('Edge TPU Converter ready') else: print('Installing Edge TPU Converter') cmd = "bash install_edge_tpu_compiler.sh" result = run_command(cmd, cwd) print(result) if 'openvino' in converter_type: rc = os.path.isdir('/opt/intel/openvino') if rc: print('OpenVINO Converter ready') else: print('Installing OpenVINO Converter') cmd = "bash install_openvino.sh" result = run_command(cmd, cwd) print(result) if 'onnx' in converter_type: try: import tf2onnx except: cmd = "pip install tf2onnx" result = run_command(cmd, cwd) print(result) self._converter_type = converter_type self._backend = backend self._dataset_path=dataset_path def edgetpu_dataset_gen(self): num_imgs = 300 image_files_list = [] from axelerate.networks.common_utils.feature import 
create_feature_extractor backend = create_feature_extractor(self._backend, [self._img_size[0], self._img_size[1]]) image_search = lambda ext : glob.glob(self._dataset_path + ext, recursive=True) for ext in ['/**/*.jpg', '/**/*.jpeg', '/**/*.png']: image_files_list.extend(image_search(ext)) for filename in image_files_list[:num_imgs]: image = cv2.imread(filename) image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) image = cv2.resize(image, (self._img_size[0], self._img_size[1])) data = np.array(backend.normalize(image), dtype=np.float32) data = np.expand_dims(data, 0) yield [data] def k210_dataset_gen(self): num_imgs = 300 image_files_list = [] from axelerate.networks.common_utils.feature import create_feature_extractor backend = create_feature_extractor(self._backend, [self._img_size[0], self._img_size[1]]) image_search = lambda ext : glob.glob(self._dataset_path + ext, recursive=True) for ext in ['/**/*.jpg', '/**/*.jpeg', '/**/*.png']: image_files_list.extend(image_search(ext)) temp_folder = os.path.join(os.path.dirname(__file__),'tmp') os.mkdir(temp_folder) for filename in image_files_list[:num_imgs]: image = cv2.imread(filename) image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) image = cv2.resize(image, (self._img_size[0], self._img_size[1])) data = np.array(backend.normalize(image), dtype=np.float32) data = np.expand_dims(data, 0) bin_filename = os.path.basename(filename).split('.')[0]+'.bin' with open(os.path.join(temp_folder, bin_filename), "wb") as f: data = np.transpose(data, [0, 3, 1, 2]) data.tofile(f) return temp_folder def convert_edgetpu(self, model_path): output_path = os.path.dirname(model_path) print(output_path) cmd = "edgetpu_compiler --out_dir {} {}".format(output_path, model_path) print(cmd) result = run_command(cmd) print(result) def convert_k210(self, model_path): folder_name = self.k210_dataset_gen() output_name = os.path.basename(model_path).split(".")[0]+".kmodel" output_path = os.path.join(os.path.dirname(model_path),output_name) print(output_path) cmd = '{} compile "{}" "{}" -i tflite --weights-quantize-threshold 1000 --dataset-format raw --dataset "{}"'.format(k210_converter_path, model_path, output_path, folder_name) print(cmd) result = run_command(cmd) shutil.rmtree(folder_name, ignore_errors=True) print(result) def convert_ir(self, model_path, model_layers): input_model = os.path.join(model_path.split(".")[0], "saved_model.pb") output_dir = os.path.dirname(model_path) output_layer = model_layers[-2].name+'/BiasAdd' cmd = 'source /opt/intel/openvino/bin/setupvars.sh && python3 /opt/intel/openvino/deployment_tools/model_optimizer/mo.py --input_model "{}" --output {} --batch 1 --reverse_input_channels --data_type FP16 --mean_values [127.5,127.5,127.5] --scale_values [127.5] --output_dir "{}"'.format(input_model, output_layer, output_dir) print(cmd) result = run_command(cmd) print(result) def convert_oak(self, model_path): output_name = model_path.split(".")[0]+".blob" cmd = 'source /opt/intel/openvino/bin/setupvars.sh && /opt/intel/openvino/deployment_tools/inference_engine/lib/intel64/myriad_compile -m "{}" -o "{}" -ip U8 -VPU_MYRIAD_PLATFORM VPU_MYRIAD_2480 -VPU_NUMBER_OF_SHAVES 4 -VPU_NUMBER_OF_CMX_SLICES 4'.format(model_path.split(".")[0] + '.xml', output_name) print(cmd) result = run_command(cmd) print(result) def convert_onnx(self, model): import tf2onnx spec = (tf.TensorSpec((None, *self._img_size, 3), tf.float32, name="input"),) output_path = self.model_path.split(".")[0] + '.onnx' model_proto, external_tensor_storage = tf2onnx.convert.from_keras(model, input_signature=spec,
output_path = output_path) def convert_tflite(self, model, model_layers, target=None): model_type = model.name model.summary() if target == 'k210': if model_type == 'yolo' or model_type == 'segnet': print("Converting to tflite without Reshape for K210 YOLO") if len(model.outputs) == 2: output1 = model.get_layer(name="detection_layer_1").output output2 = model.get_layer(name="detection_layer_2").output model = tf.keras.Model(inputs=model.input, outputs=[output1, output2]) else: model = tf.keras.Model(inputs=model.input, outputs=model.layers[-2].output) model.input.set_shape(1 + model.input.shape[1:]) converter = tf.lite.TFLiteConverter.from_keras_model(model) elif target == 'edgetpu': converter = tf.lite.TFLiteConverter.from_keras_model(model) converter.optimizations = [tf.lite.Optimize.DEFAULT] converter.representative_dataset = self.edgetpu_dataset_gen converter.target_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] converter.inference_input_type = tf.uint8 converter.inference_output_type = tf.uint8 elif target == 'tflite_dynamic': converter = tf.lite.TFLiteConverter.from_keras_model(model) converter.optimizations = [tf.lite.Optimize.DEFAULT] elif target == 'tflite_fullint': converter = tf.lite.TFLiteConverter.from_keras_model(model) converter.optimizations = [tf.lite.Optimize.DEFAULT] converter.representative_dataset = self.edgetpu_dataset_gen else: converter = tf.lite.TFLiteConverter.from_keras_model(model) tflite_model = converter.convert() with open(self.model_path.split(".")[0] + '.tflite', "wb") as f: f.write(tflite_model) def convert_model(self, model_path): k.clear_session() k.set_learning_phase(0) model = tf.keras.models.load_model(model_path, compile=False) model_layers = model.layers self._img_size = model.input_shape[1:3] self.model_path = os.path.abspath(model_path) if 'k210' in self._converter_type: self.convert_tflite(model, model_layers, 'k210') self.convert_k210(self.model_path.split(".")[0] + '.tflite') if 'edgetpu' in self._converter_type: self.convert_tflite(model, model_layers, 'edgetpu') self.convert_edgetpu(model_path.split(".")[0] + '.tflite') if 'onnx' in self._converter_type: self.convert_onnx(model) if 'openvino' in self._converter_type: model.save(model_path.split(".")[0]) self.convert_ir(model_path, model_layers) self.convert_oak(model_path) if 'tflite' in self._converter_type: self.convert_tflite(model, model_layers, self._converter_type) if __name__ == '__main__': parser = argparse.ArgumentParser(description="Keras model conversion to .kmodel, .tflite, or .onnx") parser.add_argument("--model_path", "-m", type=str, required=True, help="path to keras model") parser.add_argument("--converter_type", type=str, default='k210', help="converter type: k210, tflite, edgetpu, openvino or onnx") parser.add_argument("--dataset_path", type=str, required=False, help="path to calibration dataset") parser.add_argument("--backend", type=str, default='MobileNet7_5', help="network feature extractor, e.g.
Mobilenet/YOLO/NASNet/etc") args = parser.parse_args() converter = Converter(args.converter_type, args.backend, args.dataset_path) converter.convert_model(args.model_path) ================================================ FILE: axelerate/networks/common_utils/feature.py ================================================ import tensorflow from tensorflow.keras.models import Model from tensorflow.keras.layers import Reshape, Activation, Conv2D, Input, MaxPooling2D, BatchNormalization, Flatten, Dense, Lambda, ZeroPadding2D from tensorflow.keras.layers import LeakyReLU from tensorflow.keras.layers import Concatenate from tensorflow.keras.applications import DenseNet121 from tensorflow.keras.applications import NASNetMobile from tensorflow.keras.applications import ResNet50 from .mobilenet_sipeed.mobilenet import MobileNet def create_feature_extractor(architecture, input_size, weights = None): """ # Args architecture : str input_size : int # Returns feature_extractor : BaseFeatureExtractor instance """ if architecture == 'DenseNet121': feature_extractor = DenseNet121Feature(input_size, weights) elif architecture == 'SqueezeNet': feature_extractor = SqueezeNetFeature(input_size, weights) elif architecture == 'MobileNet1_0': feature_extractor = MobileNetFeature(input_size, weights, alpha=1) elif architecture == 'MobileNet7_5': feature_extractor = MobileNetFeature(input_size, weights, alpha=0.75) elif architecture == 'MobileNet5_0': feature_extractor = MobileNetFeature(input_size, weights, alpha=0.5) elif architecture == 'MobileNet2_5': feature_extractor = MobileNetFeature(input_size, weights, alpha=0.25) elif architecture == 'Full Yolo': feature_extractor = FullYoloFeature(input_size, weights) elif architecture == 'Tiny Yolo': feature_extractor = TinyYoloFeature(input_size, weights) elif architecture == 'NASNetMobile': feature_extractor = NASNetMobileFeature(input_size, weights) elif architecture == 'ResNet50': feature_extractor = ResNet50Feature(input_size, weights) else: raise Exception('Architecture not supported! 
Name should be Full Yolo, Tiny Yolo, MobileNet1_0, MobileNet7_5, MobileNet5_0, MobileNet2_5, SqueezeNet, NASNetMobile, ResNet50 or DenseNet121') return feature_extractor class BaseFeatureExtractor(object): """Common interface for the feature-extractor backends.""" # to be defined in each subclass def __init__(self, input_size): raise NotImplementedError("__init__ must be implemented in a subclass") # to be defined in each subclass def normalize(self, image): raise NotImplementedError("normalize must be implemented in a subclass") def get_input_size(self): input_shape = self.feature_extractor.get_input_shape_at(0) assert input_shape[1] == input_shape[2] return input_shape[1] def get_output_size(self, layer = None): if not layer: output_shape = self.feature_extractor.outputs[0].shape else: output_shape = self.feature_extractor.get_layer(layer).output.shape return output_shape[1:3] def get_output_tensor(self, layer): return self.feature_extractor.get_layer(layer).output def extract(self, input_image): return self.feature_extractor(input_image) class FullYoloFeature(BaseFeatureExtractor): """Full YOLOv2 (Darknet-19 style) feature extractor.""" def __init__(self, input_size, weights=None): input_image = Input(shape=(input_size[0], input_size[1], 3)) # the function to implement the organization layer (thanks to github.com/allanzelener/YAD2K) def space_to_depth_x2(x): return tensorflow.nn.space_to_depth(x, block_size=2) # Layer 1 x = Conv2D(32, (3,3), strides=(1,1), padding='same', name='conv_1', use_bias=False)(input_image) x = BatchNormalization(name='norm_1')(x) x = LeakyReLU(alpha=0.1)(x) x = MaxPooling2D(pool_size=(2, 2))(x) # Layer 2 x = Conv2D(64, (3,3), strides=(1,1), padding='same', name='conv_2', use_bias=False)(x) x = BatchNormalization(name='norm_2')(x) x = LeakyReLU(alpha=0.1)(x) x = MaxPooling2D(pool_size=(2, 2))(x) # Layer 3 x = Conv2D(128, (3,3), strides=(1,1), padding='same', name='conv_3', use_bias=False)(x) x = BatchNormalization(name='norm_3')(x) x = LeakyReLU(alpha=0.1)(x) # Layer 4 x = Conv2D(64, (1,1), strides=(1,1), padding='same', name='conv_4', use_bias=False)(x) x = BatchNormalization(name='norm_4')(x) x = LeakyReLU(alpha=0.1)(x) # Layer 5 x = Conv2D(128, (3,3), strides=(1,1), padding='same', name='conv_5', use_bias=False)(x) x = BatchNormalization(name='norm_5')(x) x = LeakyReLU(alpha=0.1)(x) x = MaxPooling2D(pool_size=(2, 2))(x) # Layer 6 x = Conv2D(256, (3,3), strides=(1,1), padding='same', name='conv_6', use_bias=False)(x) x = BatchNormalization(name='norm_6')(x) x = LeakyReLU(alpha=0.1)(x) # Layer 7 x = Conv2D(128, (1,1), strides=(1,1), padding='same', name='conv_7', use_bias=False)(x) x = BatchNormalization(name='norm_7')(x) x = LeakyReLU(alpha=0.1)(x) # Layer 8 x = Conv2D(256, (3,3), strides=(1,1), padding='same', name='conv_8', use_bias=False)(x) x = BatchNormalization(name='norm_8')(x) x = LeakyReLU(alpha=0.1)(x) x = MaxPooling2D(pool_size=(2, 2))(x) # Layer 9 x = Conv2D(512, (3,3), strides=(1,1), padding='same', name='conv_9', use_bias=False)(x) x = BatchNormalization(name='norm_9')(x) x = LeakyReLU(alpha=0.1)(x) # Layer 10 x = Conv2D(256, (1,1), strides=(1,1), padding='same', name='conv_10', use_bias=False)(x) x = BatchNormalization(name='norm_10')(x) x = LeakyReLU(alpha=0.1)(x) # Layer 11 x = Conv2D(512, (3,3), strides=(1,1), padding='same', name='conv_11', use_bias=False)(x) x = BatchNormalization(name='norm_11')(x) x = LeakyReLU(alpha=0.1)(x) # Layer 12 x = Conv2D(256, (1,1), strides=(1,1), padding='same', name='conv_12', use_bias=False)(x) x = BatchNormalization(name='norm_12')(x) x = LeakyReLU(alpha=0.1)(x) # Layer 13 x = Conv2D(512, (3,3), strides=(1,1), padding='same',
name='conv_13', use_bias=False)(x) x = BatchNormalization(name='norm_13')(x) x = LeakyReLU(alpha=0.1)(x) skip_connection = x x = MaxPooling2D(pool_size=(2, 2))(x) # Layer 14 x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_14', use_bias=False)(x) x = BatchNormalization(name='norm_14')(x) x = LeakyReLU(alpha=0.1)(x) # Layer 15 x = Conv2D(512, (1,1), strides=(1,1), padding='same', name='conv_15', use_bias=False)(x) x = BatchNormalization(name='norm_15')(x) x = LeakyReLU(alpha=0.1)(x) # Layer 16 x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_16', use_bias=False)(x) x = BatchNormalization(name='norm_16')(x) x = LeakyReLU(alpha=0.1)(x) # Layer 17 x = Conv2D(512, (1,1), strides=(1,1), padding='same', name='conv_17', use_bias=False)(x) x = BatchNormalization(name='norm_17')(x) x = LeakyReLU(alpha=0.1)(x) # Layer 18 x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_18', use_bias=False)(x) x = BatchNormalization(name='norm_18')(x) x = LeakyReLU(alpha=0.1)(x) # Layer 19 x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_19', use_bias=False)(x) x = BatchNormalization(name='norm_19')(x) x = LeakyReLU(alpha=0.1)(x) # Layer 20 x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_20', use_bias=False)(x) x = BatchNormalization(name='norm_20')(x) x = LeakyReLU(alpha=0.1)(x) # Layer 21 skip_connection = Conv2D(64, (1,1), strides=(1,1), padding='same', name='conv_21', use_bias=False)(skip_connection) skip_connection = BatchNormalization(name='norm_21')(skip_connection) skip_connection = LeakyReLU(alpha=0.1)(skip_connection) skip_connection = Lambda(space_to_depth_x2)(skip_connection) x = Concatenate()([skip_connection, x]) # Layer 22 x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_22', use_bias=False)(x) x = BatchNormalization(name='norm_22')(x) x = LeakyReLU(alpha=0.1)(x) self.feature_extractor = Model(input_image, x) if weights == 'imagenet': print('ImageNet weights for the YOLO backend are not available yet, defaulting to random weights') elif weights is None: pass else: print('Loaded backend weights: '+weights) self.feature_extractor.load_weights(weights) def normalize(self, image): return image / 255.
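# Usage sketch for create_feature_extractor() defined above (illustrative values only: 'MobileNet7_5' and the 224x224 input size are examples, and weights=None means random initialization):
#
#   extractor = create_feature_extractor('MobileNet7_5', [224, 224])
#   extractor.get_input_size()        # -> 224
#   extractor.get_output_size()       # -> spatial size of the final feature map, e.g. (7, 7)
#   features = extractor.extract(x)   # x: image batch already scaled with extractor.normalize()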
class TinyYoloFeature(BaseFeatureExtractor): """Tiny YOLOv2 feature extractor.""" def __init__(self, input_size, weights): input_image = Input(shape=(input_size[0], input_size[1], 3)) # Layer 1 x = Conv2D(16, (3,3), strides=(1,1), padding='same', name='conv_1', use_bias=False)(input_image) x = BatchNormalization(name='norm_1')(x) x = LeakyReLU(alpha=0.1)(x) x = MaxPooling2D(pool_size=(2, 2))(x) # Layer 2 - 5 for i in range(0,4): x = Conv2D(24*(2**i), (3,3), strides=(1,1), padding='same', name='conv_' + str(i+2), use_bias=False)(x) x = BatchNormalization(name='norm_' + str(i+2))(x) x = LeakyReLU(alpha=0.1)(x) x = MaxPooling2D(pool_size=(2, 2))(x) # Layer 6 x = Conv2D(256, (3,3), strides=(1,1), padding='same', name='conv_6', use_bias=False)(x) x = BatchNormalization(name='norm_6')(x) x = LeakyReLU(alpha=0.1)(x) x = MaxPooling2D(pool_size=(2, 2), strides=(1,1), padding='same')(x) # Layer 7 - 8 for i in range(0,2): x = Conv2D(312, (3,3), strides=(1,1), padding='same', name='conv_' + str(i+7), use_bias=False)(x) x = BatchNormalization(name='norm_' + str(i+7))(x) x = LeakyReLU(alpha=0.1)(x) self.feature_extractor = Model(input_image, x) if weights == 'imagenet': print('ImageNet weights for the YOLO backend are not available yet, defaulting to random weights') elif weights is None: pass else: print('Loaded backend weights: '+weights) self.feature_extractor.load_weights(weights) def normalize(self, image): return image / 255. class MobileNetFeature(BaseFeatureExtractor): """MobileNet v1 feature extractor; the width multiplier is set by `alpha`.""" def __init__(self, input_size, weights, alpha): input_image = Input(shape=(input_size[0], input_size[1], 3)) input_shapes_imagenet = [(128, 128,3), (160, 160,3), (192, 192,3), (224, 224,3)] input_shape =(128,128,3) for item in input_shapes_imagenet: if item[0] <= input_size[0]: input_shape = item if weights == 'imagenet': mobilenet = MobileNet(input_shape=input_shape, input_tensor=input_image, alpha = alpha, weights = 'imagenet', include_top=False, backend=tensorflow.keras.backend, layers=tensorflow.keras.layers, models=tensorflow.keras.models, utils=tensorflow.keras.utils) print('Successfully loaded imagenet backend weights') else: mobilenet = MobileNet(input_shape=(input_size[0],input_size[1],3),alpha = alpha,depth_multiplier = 1, dropout = 0.001, weights = None, include_top=False, backend=tensorflow.keras.backend, layers=tensorflow.keras.layers,models=tensorflow.keras.models,utils=tensorflow.keras.utils) if weights: print('Loaded backend weights: '+weights) mobilenet.load_weights(weights) #x = mobilenet(input_image) self.feature_extractor = mobilenet def normalize(self, image): image = image / 255. image = image - 0.5 image = image * 2.
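# taken together, the three steps above map pixel values from [0, 255] to [-1, 1], the 'tf'-style input range the MobileNet backend expects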
return image class SqueezeNetFeature(BaseFeatureExtractor): """SqueezeNet feature extractor built from fire modules.""" def __init__(self, input_size, weights): # define some auxiliary variables and the fire module sq1x1 = "squeeze1x1" exp1x1 = "expand1x1" exp3x3 = "expand3x3" relu = "relu_" def fire_module(x, fire_id, squeeze=16, expand=64): s_id = 'fire' + str(fire_id) + '/' x = Conv2D(squeeze, (1, 1), padding='valid', name=s_id + sq1x1)(x) x = Activation('relu', name=s_id + relu + sq1x1)(x) left = Conv2D(expand, (1, 1), padding='valid', name=s_id + exp1x1)(x) left = Activation('relu', name=s_id + relu + exp1x1)(left) right = Conv2D(expand, (3, 3), padding='same', name=s_id + exp3x3)(x) right = Activation('relu', name=s_id + relu + exp3x3)(right) x = Concatenate(axis=3, name=s_id + 'concat')([left, right]) return x # define the model of SqueezeNet input_image = Input(shape=(input_size[0], input_size[1], 3)) x = ZeroPadding2D(padding=((1, 1), (1, 1)), name='pad')(input_image) x = Conv2D(64, (3, 3), strides=(2, 2), padding='valid', name='conv1')(x) x = Activation('relu', name='relu_conv1')(x) x = MaxPooling2D(pool_size=(3, 3), strides=(2, 2), name='pool1')(x) x = fire_module(x, fire_id=2, squeeze=16, expand=64) x = fire_module(x, fire_id=3, squeeze=16, expand=64) x = MaxPooling2D(pool_size=(3, 3), strides=(2, 2), name='pool3')(x) x = fire_module(x, fire_id=4, squeeze=32, expand=128) x = fire_module(x, fire_id=5, squeeze=32, expand=128) x = MaxPooling2D(pool_size=(3, 3), strides=(2, 2), name='pool5')(x) x = fire_module(x, fire_id=6, squeeze=48, expand=192) x = fire_module(x, fire_id=7, squeeze=48, expand=192) x = fire_module(x, fire_id=8, squeeze=64, expand=256) x = fire_module(x, fire_id=9, squeeze=64, expand=256) self.feature_extractor = Model(input_image, x) if weights == 'imagenet': print('ImageNet weights for the SqueezeNet backend are not available yet, defaulting to random weights') elif weights is None: pass else: print('Loaded backend weights: '+ weights) self.feature_extractor.load_weights(weights) def normalize(self, image): image = image[..., ::-1] image = image.astype('float') image[..., 0] -= 103.939 image[..., 1] -= 116.779 image[..., 2] -= 123.68 return image class DenseNet121Feature(BaseFeatureExtractor): """DenseNet121 feature extractor.""" def __init__(self, input_size, weights): input_image = Input(shape=(input_size[0], input_size[1], 3)) if weights == 'imagenet': densenet = DenseNet121(input_tensor=input_image, include_top=False, weights='imagenet', pooling=None) print('Successfully loaded imagenet backend weights') else: densenet = DenseNet121(input_tensor=input_image, include_top=False, weights=None, pooling=None) if weights: densenet.load_weights(weights) print('Loaded backend weights: ' + weights) self.feature_extractor = densenet def normalize(self, image): from tensorflow.keras.applications.densenet import preprocess_input return preprocess_input(image) class NASNetMobileFeature(BaseFeatureExtractor): """NASNetMobile feature extractor.""" def __init__(self, input_size, weights): input_image = Input(shape=(input_size[0], input_size[1], 3)) if weights == 'imagenet': nasnetmobile = NASNetMobile(input_tensor=input_image, include_top=False, weights='imagenet', pooling=None) print('Successfully loaded imagenet backend weights') else: nasnetmobile = NASNetMobile(input_tensor=input_image, include_top=False, weights=None, pooling=None) if weights: nasnetmobile.load_weights(weights) print('Loaded backend weights: ' + weights) self.feature_extractor = nasnetmobile def normalize(self, image): from
tensorflow.keras.applications.nasnet import preprocess_input return preprocess_input(image) class ResNet50Feature(BaseFeatureExtractor): """ResNet50 feature extractor.""" def __init__(self, input_size, weights): input_image = Input(shape=(input_size[0], input_size[1], 3)) if weights == 'imagenet': resnet50 = ResNet50(input_tensor=input_image, weights='imagenet', include_top=False, pooling = None) print('Successfully loaded imagenet backend weights') else: resnet50 = ResNet50(input_tensor=input_image, weights=None, include_top=False, pooling = None) if weights: resnet50.load_weights(weights) print('Loaded backend weights: ' + weights) self.feature_extractor = resnet50 def normalize(self, image): image = image[..., ::-1] image = image.astype('float') image[..., 0] -= 103.939 image[..., 1] -= 116.779 image[..., 2] -= 123.68 return image ================================================ FILE: axelerate/networks/common_utils/fit.py ================================================ import shutil import os import time import tensorflow as tf import numpy as np import warnings from axelerate.networks.common_utils.callbacks import WarmUpCosineDecayScheduler from axelerate.networks.yolo.backend.utils.custom import MergeMetrics from tensorflow.keras.optimizers import SGD from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint from datetime import datetime def train(model, loss_func, train_batch_gen, valid_batch_gen, learning_rate = 1e-4, nb_epoch = 300, project_folder = 'project', first_trainable_layer = None, metric=None, metric_name="val_loss"): """A function that performs training on a general keras model. # Args model : keras.models.Model instance loss_func : function refer to https://keras.io/losses/ train_batch_gen : keras.utils.Sequence instance valid_batch_gen : keras.utils.Sequence instance learning_rate : float project_folder : str """ # Create project directory train_start = time.time() train_date = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') path = os.path.join(project_folder, train_date) basename = model.name + "_best_"+ metric_name print('Current training session folder is {}'.format(path)) os.makedirs(path) save_weights_name = os.path.join(path, basename + '.h5') save_weights_name_ctrlc = os.path.join(path, basename + '_ctrlc.h5') print('\n') # 1 Freeze layers layer_names = [layer.name for layer in model.layers] fixed_layers = [] if first_trainable_layer in layer_names: for layer in model.layers: if layer.name == first_trainable_layer: break layer.trainable = False fixed_layers.append(layer.name) elif not first_trainable_layer: pass else: print('First trainable layer specified in config file is not in the model. Did you mean one of these?') for i,layer in enumerate(model.layers): print(i,layer.name) raise Exception('First trainable layer specified in config file is not in the model') if fixed_layers != []: print("The following layers do not update weights!!!") print(" ", fixed_layers) # 2 create optimizer optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0) if not metric: metric = metric_name else: metric = metric[metric_name] print(metric) # 3.
create loss function model.compile(loss=loss_func, optimizer=optimizer, metrics=metric if metric != 'loss' else None) model.summary() #4 create callbacks tensorboard_callback = tf.keras.callbacks.TensorBoard("logs", histogram_freq=1) warm_up_lr = WarmUpCosineDecayScheduler(learning_rate_base=learning_rate, total_steps=len(train_batch_gen)*nb_epoch, warmup_learning_rate=0.0, warmup_steps=len(train_batch_gen)*min(3, nb_epoch-1), hold_base_rate_steps=0, verbose=1) if metric_name in ['recall', 'precision']: mergedMetric = MergeMetrics(model, metric_name, 1, True, save_weights_name, tensorboard_callback) callbacks = [mergedMetric, warm_up_lr, tensorboard_callback] else: early_stop = EarlyStopping(monitor='val_' + metric, min_delta=0.001, patience=20, mode='auto', verbose=2, restore_best_weights=True) checkpoint = ModelCheckpoint(save_weights_name, monitor='val_' + metric, verbose=2, save_best_only=True, mode='auto', period=1) reduce_lr = ReduceLROnPlateau(monitor='val_' + metric, factor=0.2, patience=10, min_lr=1e-6, mode='auto', verbose=2) callbacks = [early_stop, checkpoint, warm_up_lr, tensorboard_callback] # 4. training try: model.fit(train_batch_gen, steps_per_epoch = len(train_batch_gen), epochs = nb_epoch, validation_data = valid_batch_gen, validation_steps = len(valid_batch_gen), callbacks = callbacks, verbose = 1, workers = 4, max_queue_size = 10, use_multiprocessing = True) except KeyboardInterrupt: print("Saving model and copying logs") model.save(save_weights_name_ctrlc, overwrite=True, include_optimizer=False) shutil.copytree("logs", os.path.join(path, "logs")) return model.layers, save_weights_name_ctrlc shutil.copytree("logs", os.path.join(path, "logs")) _print_time(time.time()-train_start) return model.layers, save_weights_name def _print_time(process_time): if process_time < 60: print("{:d}-seconds to train".format(int(process_time))) else: print("{:d}-mins to train".format(int(process_time/60))) ================================================ FILE: axelerate/networks/common_utils/install_edge_tpu_compiler.sh ================================================ wget https://packages.cloud.google.com/apt/doc/apt-key.gpg sudo apt-key add apt-key.gpg && echo "deb https://packages.cloud.google.com/apt coral-edgetpu-stable main" | sudo tee /etc/apt/sources.list.d/coral-edgetpu.list sudo apt-get update && sudo apt-get install -y edgetpu-compiler && rm apt-key.gpg ================================================ FILE: axelerate/networks/common_utils/install_openvino.sh ================================================ sudo apt-get install -y pciutils cpio && wget http://registrationcenter-download.intel.com/akdlm/irc_nas/16345/l_openvino_toolkit_p_2020.1.023.tgz && tar xf l_openvino_toolkit_p_2020.1.023.tgz && cd l_openvino_toolkit_p_2020.1.023 && sudo -E ./install_openvino_dependencies.sh && sed -i 's/decline/accept/g' silent.cfg && sudo -E ./install.sh --silent silent.cfg ================================================ FILE: axelerate/networks/common_utils/mobilenet_sipeed/__init__.py ================================================ """Enables dynamic setting of underlying Keras module. """ from __future__ import absolute_import from __future__ import division from __future__ import print_function _KERAS_BACKEND = None _KERAS_LAYERS = None _KERAS_MODELS = None _KERAS_UTILS = None def set_keras_submodules(backend=None, layers=None, models=None, utils=None, engine=None): # Deprecated, will be removed in the future. 
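# The function below stores the caller-supplied Keras submodules in module-level globals, so the vendored MobileNet can be built against either standalone keras or tf.keras.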
global _KERAS_BACKEND global _KERAS_LAYERS global _KERAS_MODELS global _KERAS_UTILS _KERAS_BACKEND = backend _KERAS_LAYERS = layers _KERAS_MODELS = models _KERAS_UTILS = utils def get_keras_submodule(name): # Deprecated, will be removed in the future. if name not in {'backend', 'layers', 'models', 'utils'}: raise ImportError( 'Can only retrieve one of "backend", ' '"layers", "models", or "utils". ' 'Requested: %s' % name) if _KERAS_BACKEND is None: raise ImportError('You need to first `import keras` ' 'in order to use `keras_applications`. ' 'For instance, you can do:\n\n' '```\n' 'import keras\n' 'from keras_applications import vgg16\n' '```\n\n' 'Or, preferably, this equivalent formulation:\n\n' '```\n' 'from keras import applications\n' '```\n') if name == 'backend': return _KERAS_BACKEND elif name == 'layers': return _KERAS_LAYERS elif name == 'models': return _KERAS_MODELS elif name == 'utils': return _KERAS_UTILS def get_submodules_from_kwargs(kwargs): backend = kwargs.get('backend', _KERAS_BACKEND) layers = kwargs.get('layers', _KERAS_LAYERS) models = kwargs.get('models', _KERAS_MODELS) utils = kwargs.get('utils', _KERAS_UTILS) for key in kwargs.keys(): if key not in ['backend', 'layers', 'models', 'utils']: raise TypeError('Invalid keyword argument: %s' % key) return backend, layers, models, utils def correct_pad(backend, inputs, kernel_size): """Returns a tuple for zero-padding for 2D convolution with downsampling. # Arguments inputs: An integer or tuple/list of 2 integers. kernel_size: An integer or tuple/list of 2 integers. # Returns A tuple. """ img_dim = 2 if backend.image_data_format() == 'channels_first' else 1 input_size = backend.int_shape(inputs)[img_dim:(img_dim + 2)] if isinstance(kernel_size, int): kernel_size = (kernel_size, kernel_size) if input_size[0] is None: adjust = (1, 1) else: adjust = (1 - input_size[0] % 2, 1 - input_size[1] % 2) correct = (kernel_size[0] // 2, kernel_size[1] // 2) return ((correct[0] - adjust[0], correct[0]), (correct[1] - adjust[1], correct[1])) __version__ = '1.0.7' from . import mobilenet ================================================ FILE: axelerate/networks/common_utils/mobilenet_sipeed/imagenet_utils.py ================================================ """Utilities for ImageNet data preprocessing & prediction decoding. """ from __future__ import absolute_import from __future__ import division from __future__ import print_function import json import warnings import numpy as np from . import get_submodules_from_kwargs CLASS_INDEX = None CLASS_INDEX_PATH = ('https://s3.amazonaws.com/deep-learning-models/' 'image-models/imagenet_class_index.json') # Global tensor of imagenet mean for preprocessing symbolic inputs _IMAGENET_MEAN = None def _preprocess_numpy_input(x, data_format, mode, **kwargs): """Preprocesses a Numpy array encoding a batch of images. # Arguments x: Input array, 3D or 4D. data_format: Data format of the image array. mode: One of "caffe", "tf" or "torch". - caffe: will convert the images from RGB to BGR, then will zero-center each color channel with respect to the ImageNet dataset, without scaling. - tf: will scale pixels between -1 and 1, sample-wise. - torch: will scale pixels between 0 and 1 and then will normalize each channel with respect to the ImageNet dataset. # Returns Preprocessed Numpy array. """ backend, _, _, _ = get_submodules_from_kwargs(kwargs) if not issubclass(x.dtype.type, np.floating): x = x.astype(backend.floatx(), copy=False) if mode == 'tf': x /= 127.5 x -= 1.
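# at this point pixels have been rescaled from [0, 255] to [-1, 1]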
return x if mode == 'torch': x /= 255. mean = [0.485, 0.456, 0.406] std = [0.229, 0.224, 0.225] else: if data_format == 'channels_first': # 'RGB'->'BGR' if x.ndim == 3: x = x[::-1, ...] else: x = x[:, ::-1, ...] else: # 'RGB'->'BGR' x = x[..., ::-1] mean = [103.939, 116.779, 123.68] std = None # Zero-center by mean pixel if data_format == 'channels_first': if x.ndim == 3: x[0, :, :] -= mean[0] x[1, :, :] -= mean[1] x[2, :, :] -= mean[2] if std is not None: x[0, :, :] /= std[0] x[1, :, :] /= std[1] x[2, :, :] /= std[2] else: x[:, 0, :, :] -= mean[0] x[:, 1, :, :] -= mean[1] x[:, 2, :, :] -= mean[2] if std is not None: x[:, 0, :, :] /= std[0] x[:, 1, :, :] /= std[1] x[:, 2, :, :] /= std[2] else: x[..., 0] -= mean[0] x[..., 1] -= mean[1] x[..., 2] -= mean[2] if std is not None: x[..., 0] /= std[0] x[..., 1] /= std[1] x[..., 2] /= std[2] return x def _preprocess_symbolic_input(x, data_format, mode, **kwargs): """Preprocesses a tensor encoding a batch of images. # Arguments x: Input tensor, 3D or 4D. data_format: Data format of the image tensor. mode: One of "caffe", "tf" or "torch". - caffe: will convert the images from RGB to BGR, then will zero-center each color channel with respect to the ImageNet dataset, without scaling. - tf: will scale pixels between -1 and 1, sample-wise. - torch: will scale pixels between 0 and 1 and then will normalize each channel with respect to the ImageNet dataset. # Returns Preprocessed tensor. """ global _IMAGENET_MEAN backend, _, _, _ = get_submodules_from_kwargs(kwargs) if mode == 'tf': x /= 127.5 x -= 1. return x if mode == 'torch': x /= 255. mean = [0.485, 0.456, 0.406] std = [0.229, 0.224, 0.225] else: if data_format == 'channels_first': # 'RGB'->'BGR' if backend.ndim(x) == 3: x = x[::-1, ...] else: x = x[:, ::-1, ...] else: # 'RGB'->'BGR' x = x[..., ::-1] mean = [103.939, 116.779, 123.68] std = None if _IMAGENET_MEAN is None: _IMAGENET_MEAN = backend.constant(-np.array(mean)) # Zero-center by mean pixel if backend.dtype(x) != backend.dtype(_IMAGENET_MEAN): x = backend.bias_add( x, backend.cast(_IMAGENET_MEAN, backend.dtype(x)), data_format=data_format) else: x = backend.bias_add(x, _IMAGENET_MEAN, data_format) if std is not None: x /= std return x def preprocess_input(x, data_format=None, mode='caffe', **kwargs): """Preprocesses a tensor or Numpy array encoding a batch of images. # Arguments x: Input Numpy or symbolic tensor, 3D or 4D. The preprocessed data is written over the input data if the data types are compatible. To avoid this behaviour, `numpy.copy(x)` can be used. data_format: Data format of the image tensor/array. mode: One of "caffe", "tf" or "torch". - caffe: will convert the images from RGB to BGR, then will zero-center each color channel with respect to the ImageNet dataset, without scaling. - tf: will scale pixels between -1 and 1, sample-wise. - torch: will scale pixels between 0 and 1 and then will normalize each channel with respect to the ImageNet dataset. # Returns Preprocessed tensor or Numpy array. # Raises ValueError: In case of unknown `data_format` argument. 
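# Example A minimal numeric illustration ('tf' mode simply rescales to [-1, 1]; `data_format` is passed explicitly here, so no Keras backend has to be registered first):
>>> import numpy as np
>>> preprocess_input(np.array([[[0., 127.5, 255.]]]), data_format='channels_last', mode='tf')
array([[[-1., 0., 1.]]])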
""" backend, _, _, _ = get_submodules_from_kwargs(kwargs) if data_format is None: data_format = backend.image_data_format() if data_format not in {'channels_first', 'channels_last'}: raise ValueError('Unknown data_format ' + str(data_format)) if isinstance(x, np.ndarray): return _preprocess_numpy_input(x, data_format=data_format, mode=mode, **kwargs) else: return _preprocess_symbolic_input(x, data_format=data_format, mode=mode, **kwargs) def decode_predictions(preds, top=5, **kwargs): """Decodes the prediction of an ImageNet model. # Arguments preds: Numpy tensor encoding a batch of predictions. top: Integer, how many top-guesses to return. # Returns A list of lists of top class prediction tuples `(class_name, class_description, score)`. One list of tuples per sample in batch input. # Raises ValueError: In case of invalid shape of the `pred` array (must be 2D). """ global CLASS_INDEX backend, _, _, keras_utils = get_submodules_from_kwargs(kwargs) if len(preds.shape) != 2 or preds.shape[1] != 1000: raise ValueError('`decode_predictions` expects ' 'a batch of predictions ' '(i.e. a 2D array of shape (samples, 1000)). ' 'Found array with shape: ' + str(preds.shape)) if CLASS_INDEX is None: fpath = keras_utils.get_file( 'imagenet_class_index.json', CLASS_INDEX_PATH, cache_subdir='models', file_hash='c2c37ea517e94d9795004a39431a14cb') with open(fpath) as f: CLASS_INDEX = json.load(f) results = [] for pred in preds: top_indices = pred.argsort()[-top:][::-1] result = [tuple(CLASS_INDEX[str(i)]) + (pred[i],) for i in top_indices] result.sort(key=lambda x: x[2], reverse=True) results.append(result) return results def _obtain_input_shape(input_shape, default_size, min_size, data_format, require_flatten, weights=None): """Internal utility to compute/validate a model's input shape. # Arguments input_shape: Either None (will return the default network input shape), or a user-provided shape to be validated. default_size: Default input width/height for the model. min_size: Minimum input width/height accepted by the model. data_format: Image data format to use. require_flatten: Whether the model is expected to be linked to a classifier via a Flatten layer. weights: One of `None` (random initialization) or 'imagenet' (pre-training on ImageNet). If weights='imagenet' input channels must be equal to 3. # Returns An integer shape tuple (may include None entries). # Raises ValueError: In case of invalid argument values. """ if weights != 'imagenet' and input_shape and len(input_shape) == 3: if data_format == 'channels_first': if input_shape[0] not in {1, 3}: warnings.warn( 'This model usually expects 1 or 3 input channels. ' 'However, it was passed an input_shape with ' + str(input_shape[0]) + ' input channels.') default_shape = (input_shape[0], default_size, default_size) else: if input_shape[-1] not in {1, 3}: warnings.warn( 'This model usually expects 1 or 3 input channels. 
' 'However, it was passed an input_shape with ' + str(input_shape[-1]) + ' input channels.') default_shape = (default_size, default_size, input_shape[-1]) else: if data_format == 'channels_first': default_shape = (3, default_size, default_size) else: default_shape = (default_size, default_size, 3) if weights == 'imagenet' and require_flatten: if input_shape is not None: if input_shape != default_shape: raise ValueError('When setting `include_top=True` ' 'and loading `imagenet` weights, ' '`input_shape` should be ' + str(default_shape) + '.') return default_shape if input_shape: if data_format == 'channels_first': if input_shape is not None: if len(input_shape) != 3: raise ValueError( '`input_shape` must be a tuple of three integers.') if input_shape[0] != 3 and weights == 'imagenet': raise ValueError('The input must have 3 channels; got ' '`input_shape=' + str(input_shape) + '`') if ((input_shape[1] is not None and input_shape[1] < min_size) or (input_shape[2] is not None and input_shape[2] < min_size)): raise ValueError('Input size must be at least ' + str(min_size) + 'x' + str(min_size) + '; got `input_shape=' + str(input_shape) + '`') else: if input_shape is not None: if len(input_shape) != 3: raise ValueError( '`input_shape` must be a tuple of three integers.') if input_shape[-1] != 3 and weights == 'imagenet': raise ValueError('The input must have 3 channels; got ' '`input_shape=' + str(input_shape) + '`') if ((input_shape[0] is not None and input_shape[0] < min_size) or (input_shape[1] is not None and input_shape[1] < min_size)): raise ValueError('Input size must be at least ' + str(min_size) + 'x' + str(min_size) + '; got `input_shape=' + str(input_shape) + '`') else: if require_flatten: input_shape = default_shape else: if data_format == 'channels_first': input_shape = (3, None, None) else: input_shape = (None, None, 3) if require_flatten: if None in input_shape: raise ValueError('If `include_top` is True, ' 'you should specify a static `input_shape`. ' 'Got `input_shape=' + str(input_shape) + '`') return input_shape ================================================ FILE: axelerate/networks/common_utils/mobilenet_sipeed/mobilenet.py ================================================ """MobileNet v1 models for Keras. MobileNet is a general architecture and can be used for multiple use cases. Depending on the use case, it can use different input layer size and different width factors. This allows different width models to reduce the number of multiply-adds and thereby reduce inference cost on mobile devices. MobileNets support any input size greater than 32 x 32, with larger image sizes offering better performance. The number of parameters and number of multiply-adds can be modified by using the `alpha` parameter, which increases/decreases the number of filters in each layer. By altering the image size and `alpha` parameter, all 16 models from the paper can be built, with ImageNet weights provided. The paper demonstrates the performance of MobileNets using `alpha` values of 1.0 (also called 100 % MobileNet), 0.75, 0.5 and 0.25. For each of these `alpha` values, weights for 4 different input image sizes are provided (224, 192, 160, 128). 
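For example, a narrower variant can be instantiated directly (an illustrative sketch; it assumes the Keras submodules resolve to their defaults when no `**kwargs` are passed):
    from axelerate.networks.common_utils.mobilenet_sipeed.mobilenet import MobileNet
    # 0.75-width MobileNet for 160x160 RGB inputs, randomly initialized
    model = MobileNet(input_shape=(160, 160, 3), alpha=0.75, include_top=False, weights=None)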
The following table describes the size and accuracy of the 100% MobileNet on size 224 x 224:
----------------------------------------------------------------------------
Width Multiplier (alpha) | ImageNet Acc | Multiply-Adds (M) | Params (M)
----------------------------------------------------------------------------
| 1.0 MobileNet-224  | 70.6 % | 529 | 4.2 |
| 0.75 MobileNet-224 | 68.4 % | 325 | 2.6 |
| 0.50 MobileNet-224 | 63.7 % | 149 | 1.3 |
| 0.25 MobileNet-224 | 50.6 % |  41 | 0.5 |
----------------------------------------------------------------------------
The following table describes the performance of the 100% MobileNet on various input sizes:
------------------------------------------------------------------------
Resolution | ImageNet Acc | Multiply-Adds (M) | Params (M)
------------------------------------------------------------------------
| 1.0 MobileNet-224 | 70.6 % | 529 | 4.2 |
| 1.0 MobileNet-192 | 69.1 % | 529 | 4.2 |
| 1.0 MobileNet-160 | 67.2 % | 529 | 4.2 |
| 1.0 MobileNet-128 | 64.4 % | 529 | 4.2 |
------------------------------------------------------------------------
The weights for all 16 models are obtained and translated from TensorFlow checkpoints found at https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md # Reference - [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/pdf/1704.04861.pdf) """ from __future__ import print_function from __future__ import absolute_import from __future__ import division import os import warnings from . import get_submodules_from_kwargs from . import imagenet_utils from .imagenet_utils import decode_predictions from .imagenet_utils import _obtain_input_shape BASE_WEIGHT_PATH = ('https://github.com/fchollet/deep-learning-models/' 'releases/download/v0.6/') backend = None layers = None models = None keras_utils = None def preprocess_input(x, **kwargs): """Preprocesses a numpy array encoding a batch of images. # Arguments x: a 4D numpy array consisting of RGB values within [0, 255]. # Returns Preprocessed array. """ return imagenet_utils.preprocess_input(x, mode='tf', **kwargs) def MobileNet(input_shape=None, alpha=1.0, depth_multiplier=1, dropout=1e-3, include_top=True, weights='imagenet', input_tensor=None, pooling=None, classes=1000, **kwargs): """Instantiates the MobileNet architecture. # Arguments input_shape: optional shape tuple, only to be specified if `include_top` is False (otherwise the input shape has to be `(224, 224, 3)` (with `channels_last` data format) or (3, 224, 224) (with `channels_first` data format). It should have exactly 3 input channels, and width and height should be no smaller than 32. E.g. `(200, 200, 3)` would be one valid value. alpha: controls the width of the network. This is known as the width multiplier in the MobileNet paper. - If `alpha` < 1.0, proportionally decreases the number of filters in each layer. - If `alpha` > 1.0, proportionally increases the number of filters in each layer. - If `alpha` = 1, default number of filters from the paper are used at each layer. depth_multiplier: depth multiplier for depthwise convolution, i.e. the number of depthwise output channels per input channel. dropout: dropout rate include_top: whether to include the fully-connected layer at the top of the network. weights: one of `None` (random initialization), 'imagenet' (pre-training on ImageNet), or the path to the weights file to be loaded. input_tensor: optional Keras tensor (i.e.
output of `layers.Input()`) to use as image input for the model. pooling: Optional pooling mode for feature extraction when `include_top` is `False`. - `None` means that the output of the model will be the 4D tensor output of the last convolutional block. - `avg` means that global average pooling will be applied to the output of the last convolutional block, and thus the output of the model will be a 2D tensor. - `max` means that global max pooling will be applied. classes: optional number of classes to classify images into, only to be specified if `include_top` is True, and if no `weights` argument is specified. # Returns A Keras model instance. # Raises ValueError: in case of invalid argument for `weights`, or invalid input shape. RuntimeError: If attempting to run this model with a backend that does not support separable convolutions. """ global backend, layers, models, keras_utils backend, layers, models, keras_utils = get_submodules_from_kwargs(kwargs) if not (weights in {'imagenet', None} or os.path.exists(weights)): raise ValueError('The `weights` argument should be either ' '`None` (random initialization), `imagenet` ' '(pre-training on ImageNet), ' 'or the path to the weights file to be loaded.') if weights == 'imagenet' and include_top and classes != 1000: raise ValueError('If using `weights` as `"imagenet"` with `include_top` ' 'as true, `classes` should be 1000') # Determine proper input shape and default size. if input_shape is None: default_size = 224 else: if backend.image_data_format() == 'channels_first': rows = input_shape[1] cols = input_shape[2] else: rows = input_shape[0] cols = input_shape[1] if rows == cols and rows in [128, 160, 192, 224]: default_size = rows else: default_size = 224 input_shape = _obtain_input_shape(input_shape, default_size=default_size, min_size=32, data_format=backend.image_data_format(), require_flatten=include_top, weights=weights) if backend.image_data_format() == 'channels_last': row_axis, col_axis = (0, 1) else: row_axis, col_axis = (1, 2) rows = input_shape[row_axis] cols = input_shape[col_axis] if weights == 'imagenet': if depth_multiplier != 1: raise ValueError('If imagenet weights are being loaded, ' 'depth multiplier must be 1') if alpha not in [0.25, 0.50, 0.75, 1.0]: raise ValueError('If imagenet weights are being loaded, ' 'alpha can be one of' '`0.25`, `0.50`, `0.75` or `1.0` only.') if rows != cols or rows not in [128, 160, 192, 224]: if rows is None: rows = 224 warnings.warn('MobileNet shape is undefined.' ' Weights for input shape ' '(224, 224) will be loaded.') else: raise ValueError('If imagenet weights are being loaded, ' 'input must have a static square shape ' '(one of (128, 128), (160, 160), ' '(192, 192), or (224, 224)). ' 'Input shape provided = %s' % (input_shape,)) if backend.image_data_format() != 'channels_last': warnings.warn('The MobileNet family of models is only available ' 'for the input data format "channels_last" ' '(width, height, channels). ' 'However your settings specify the default ' 'data format "channels_first" (channels, width, height).' ' You should set `image_data_format="channels_last"` ' 'in your Keras config located at ~/.keras/keras.json. 
' 'The model being returned right now will expect inputs ' 'to follow the "channels_last" data format.') backend.set_image_data_format('channels_last') old_data_format = 'channels_first' else: old_data_format = None if input_tensor is None: img_input = layers.Input(shape=input_shape) else: if not backend.is_keras_tensor(input_tensor): img_input = layers.Input(tensor=input_tensor, shape=input_shape) else: img_input = input_tensor x = _conv_block(img_input, 32, alpha, strides=(2, 2)) x = _depthwise_conv_block(x, 64, alpha, depth_multiplier, block_id=1) x = _depthwise_conv_block(x, 128, alpha, depth_multiplier, strides=(2, 2), block_id=2) x = _depthwise_conv_block(x, 128, alpha, depth_multiplier, block_id=3) x = _depthwise_conv_block(x, 256, alpha, depth_multiplier, strides=(2, 2), block_id=4) x = _depthwise_conv_block(x, 256, alpha, depth_multiplier, block_id=5) x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, strides=(2, 2), block_id=6) x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=7) x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=8) x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=9) x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=10) x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=11) x = _depthwise_conv_block(x, 1024, alpha, depth_multiplier, strides=(2, 2), block_id=12) x = _depthwise_conv_block(x, 1024, alpha, depth_multiplier, block_id=13) if include_top: if backend.image_data_format() == 'channels_first': shape = (int(1024 * alpha), 1, 1) else: shape = (1, 1, int(1024 * alpha)) x = layers.GlobalAveragePooling2D()(x) x = layers.Reshape(shape, name='reshape_1')(x) x = layers.Dropout(dropout, name='dropout')(x) x = layers.Conv2D(classes, (1, 1), padding='same', name='conv_preds')(x) x = layers.Activation('softmax', name='act_softmax')(x) x = layers.Reshape((classes,), name='reshape_2')(x) else: if pooling == 'avg': x = layers.GlobalAveragePooling2D()(x) elif pooling == 'max': x = layers.GlobalMaxPooling2D()(x) # Ensure that the model takes into account # any potential predecessors of `input_tensor`. if input_tensor is not None: inputs = keras_utils.get_source_inputs(input_tensor) else: inputs = img_input # Create model. model = models.Model(inputs, x, name='mobilenet_%0.2f_%s' % (alpha, rows)) # Load weights. if weights == 'imagenet': if backend.image_data_format() == 'channels_first': raise ValueError('Weights for "channels_first" format ' 'are not available.') if alpha == 1.0: alpha_text = '1_0' elif alpha == 0.75: alpha_text = '7_5' elif alpha == 0.50: alpha_text = '5_0' else: alpha_text = '2_5' if include_top: model_name = 'mobilenet_%s_%d_tf.h5' % (alpha_text, rows) weight_path = BASE_WEIGHT_PATH + model_name weights_path = keras_utils.get_file(model_name, weight_path, cache_subdir='models') else: model_name = 'mobilenet_%s_%d_tf_no_top.h5' % (alpha_text, rows) weight_path = BASE_WEIGHT_PATH + model_name weights_path = keras_utils.get_file(model_name, weight_path, cache_subdir='models') model.load_weights(weights_path) elif weights is not None: model.load_weights(weights) if old_data_format: backend.set_image_data_format(old_data_format) return model def _conv_block(inputs, filters, alpha, kernel=(3, 3), strides=(1, 1)): """Adds an initial convolution layer (with batch normalization and relu6). # Arguments inputs: Input tensor of shape `(rows, cols, 3)` (with `channels_last` data format) or (3, rows, cols) (with `channels_first` data format). 
It should have exactly 3 inputs channels, and width and height should be no smaller than 32. E.g. `(224, 224, 3)` would be one valid value. filters: Integer, the dimensionality of the output space (i.e. the number of output filters in the convolution). alpha: controls the width of the network. - If `alpha` < 1.0, proportionally decreases the number of filters in each layer. - If `alpha` > 1.0, proportionally increases the number of filters in each layer. - If `alpha` = 1, default number of filters from the paper are used at each layer. kernel: An integer or tuple/list of 2 integers, specifying the width and height of the 2D convolution window. Can be a single integer to specify the same value for all spatial dimensions. strides: An integer or tuple/list of 2 integers, specifying the strides of the convolution along the width and height. Can be a single integer to specify the same value for all spatial dimensions. Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. # Input shape 4D tensor with shape: `(samples, channels, rows, cols)` if data_format='channels_first' or 4D tensor with shape: `(samples, rows, cols, channels)` if data_format='channels_last'. # Output shape 4D tensor with shape: `(samples, filters, new_rows, new_cols)` if data_format='channels_first' or 4D tensor with shape: `(samples, new_rows, new_cols, filters)` if data_format='channels_last'. `rows` and `cols` values might have changed due to stride. # Returns Output tensor of block. """ channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1 filters = int(filters * alpha) x = layers.ZeroPadding2D(padding=((1, 1), (1, 1)), name='conv1_pad')(inputs) x = layers.Conv2D(filters, kernel, padding='valid', use_bias=False, strides=strides, name='conv1')(x) x = layers.BatchNormalization(axis=channel_axis, name='conv1_bn')(x) return layers.ReLU(6., name='conv1_relu')(x) def _depthwise_conv_block(inputs, pointwise_conv_filters, alpha, depth_multiplier=1, strides=(1, 1), block_id=1): """Adds a depthwise convolution block. A depthwise convolution block consists of a depthwise conv, batch normalization, relu6, pointwise convolution, batch normalization and relu6 activation. # Arguments inputs: Input tensor of shape `(rows, cols, channels)` (with `channels_last` data format) or (channels, rows, cols) (with `channels_first` data format). pointwise_conv_filters: Integer, the dimensionality of the output space (i.e. the number of output filters in the pointwise convolution). alpha: controls the width of the network. - If `alpha` < 1.0, proportionally decreases the number of filters in each layer. - If `alpha` > 1.0, proportionally increases the number of filters in each layer. - If `alpha` = 1, default number of filters from the paper are used at each layer. depth_multiplier: The number of depthwise convolution output channels for each input channel. The total number of depthwise convolution output channels will be equal to `filters_in * depth_multiplier`. strides: An integer or tuple/list of 2 integers, specifying the strides of the convolution along the width and height. Can be a single integer to specify the same value for all spatial dimensions. Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. block_id: Integer, a unique identification designating the block number. 
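(For example, with `alpha=0.25` and `pointwise_conv_filters=512` the pointwise convolution emits `int(512 * 0.25) = 128` output channels, while `depth_multiplier=2` would make the depthwise stage produce two output channels per input channel.)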
# Input shape 4D tensor with shape: `(batch, channels, rows, cols)` if data_format='channels_first' or 4D tensor with shape: `(batch, rows, cols, channels)` if data_format='channels_last'. # Output shape 4D tensor with shape: `(batch, filters, new_rows, new_cols)` if data_format='channels_first' or 4D tensor with shape: `(batch, new_rows, new_cols, filters)` if data_format='channels_last'. `rows` and `cols` values might have changed due to stride. # Returns Output tensor of block. """ channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1 pointwise_conv_filters = int(pointwise_conv_filters * alpha) if strides == (1, 1): x = inputs else: x = layers.ZeroPadding2D(((1, 1), (1, 1)), name='conv_pad_%d' % block_id)(inputs) x = layers.DepthwiseConv2D((3, 3), padding='same' if strides == (1, 1) else 'valid', depth_multiplier=depth_multiplier, strides=strides, use_bias=False, name='conv_dw_%d' % block_id)(x) x = layers.BatchNormalization( axis=channel_axis, name='conv_dw_%d_bn' % block_id)(x) x = layers.ReLU(6., name='conv_dw_%d_relu' % block_id)(x) x = layers.Conv2D(pointwise_conv_filters, (1, 1), padding='same', use_bias=False, strides=(1, 1), name='conv_pw_%d' % block_id)(x) x = layers.BatchNormalization(axis=channel_axis, name='conv_pw_%d_bn' % block_id)(x) return layers.ReLU(6., name='conv_pw_%d_relu' % block_id)(x) ================================================ FILE: axelerate/networks/segnet/__init__.py ================================================ ================================================ FILE: axelerate/networks/segnet/data_utils/__init__.py ================================================ ================================================ FILE: axelerate/networks/segnet/data_utils/data_loader.py ================================================ import os import numpy as np np.random.seed(1337) from tensorflow.keras.utils import Sequence from axelerate.networks.common_utils.augment import process_image_segmentation import glob import itertools import random import six import cv2 try: from tqdm import tqdm except ImportError: print("tqdm not found, disabling progress bars") def tqdm(iter): return iter from ..models.config import IMAGE_ORDERING DATA_LOADER_SEED = 0 random.seed(DATA_LOADER_SEED) class_colors = [(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) for _ in range(5000)] class DataLoaderError(Exception): pass def get_pairs_from_paths(images_path, segs_path, ignore_non_matching=True): """ Find all the images from the images_path directory and the segmentation images from the segs_path directory while checking integrity of data """ ACCEPTABLE_IMAGE_FORMATS = [".jpg", ".jpeg", ".png" , ".bmp"] ACCEPTABLE_SEGMENTATION_FORMATS = [".png", ".bmp"] image_files = [] segmentation_files = {} for dir_entry in os.listdir(images_path): if os.path.isfile(os.path.join(images_path, dir_entry)) and \ os.path.splitext(dir_entry)[1] in ACCEPTABLE_IMAGE_FORMATS: file_name, file_extension = os.path.splitext(dir_entry) image_files.append((file_name, file_extension, os.path.join(images_path, dir_entry))) for dir_entry in os.listdir(segs_path): if os.path.isfile(os.path.join(segs_path, dir_entry)) and \ os.path.splitext(dir_entry)[1] in ACCEPTABLE_SEGMENTATION_FORMATS: file_name, file_extension = os.path.splitext(dir_entry) if file_name in segmentation_files: raise DataLoaderError("Segmentation file with filename {0} already exists and is ambiguous to resolve with path {1}. 
Please remove or rename the latter.".format(file_name, os.path.join(segs_path, dir_entry))) segmentation_files[file_name] = (file_extension, os.path.join(segs_path, dir_entry)) return_value = [] # Match the images and segmentations for image_file, _, image_full_path in image_files: if image_file in segmentation_files: return_value.append((image_full_path, segmentation_files[image_file][1])) elif ignore_non_matching: print("No corresponding segmentation found for image {0}.".format(image_full_path)) continue else: # Error out raise DataLoaderError("No corresponding segmentation found for image {0}.".format(image_full_path)) return return_value def get_image_array(image_input, norm, ordering='channels_first'): """ Load image array from input """ if type(image_input) is np.ndarray: # It is already an array, use it as it is img = image_input elif isinstance(image_input, six.string_types): if not os.path.isfile(image_input): raise DataLoaderError("get_image_array: path {0} doesn't exist".format(image_input)) img = cv2.imread(image_input, 1) else: raise DataLoaderError("get_image_array: Can't process input type {0}".format(str(type(image_input)))) if norm: img = norm(img) if ordering == 'channels_first': img = np.rollaxis(img, 2, 0) return img def get_segmentation_array(image_input, nClasses, no_reshape=True): """ Load segmentation array from input """ if type(image_input) is np.ndarray: # It is already an array, use it as it is img = image_input elif isinstance(image_input, six.string_types): if not os.path.isfile(image_input): raise DataLoaderError("get_segmentation_array: path {0} doesn't exist".format(image_input)) img = cv2.imread(image_input, 1) else: raise DataLoaderError("get_segmentation_array: Can't process input type {0}".format(str(type(image_input)))) # Allocate the one-hot label map only after the image is loaded, so string paths work too seg_labels = np.zeros((img.shape[0], img.shape[1], nClasses)) img = img[:, :, 0] for c in range(nClasses): seg_labels[:, :, c] = (img == c).astype(int) if not no_reshape: seg_labels = np.reshape(seg_labels, (img.shape[0] * img.shape[1], nClasses)) return seg_labels def verify_segmentation_dataset(images_path, segs_path, n_classes, show_all_errors=False): try: img_seg_pairs = get_pairs_from_paths(images_path, segs_path) if not len(img_seg_pairs): print("Couldn't load any data from images_path: {0} and segmentations path: {1}".format(images_path, segs_path)) return False return_value = True for im_fn, seg_fn in tqdm(img_seg_pairs): img = cv2.imread(im_fn) seg = cv2.imread(seg_fn) # Check dimensions match if not img.shape == seg.shape: return_value = False print("The size of image {0} and its segmentation {1} doesn't match (possibly the files are corrupt).".format(im_fn, seg_fn)) if not show_all_errors: break else: max_pixel_value = np.max(seg[:, :, 0]) if max_pixel_value >= n_classes: return_value = False print("The pixel values of the segmentation image {0} violate the range [0, {1}]. Found maximum pixel value {2}".format(seg_fn, str(n_classes - 1), max_pixel_value)) if not show_all_errors: break if return_value: print("Dataset verified!
") else: print("Dataset not verified!") return return_value except DataLoaderError as e: print("Found error during data loading\n{0}".format(str(e))) return False def create_batch_generator(images_path, segs_path, input_size=224, output_size=112, n_classes=51, batch_size=8, repeat_times=1, do_augment=False, norm=None): worker = BatchGenerator(images_path, segs_path, batch_size, n_classes, input_size, output_size, repeat_times, do_augment, norm) return worker class BatchGenerator(Sequence): def __init__(self, images_path, segs_path, batch_size, n_classes,input_size, output_size, repeat_times, do_augment=False, norm=None): self.norm = norm self.n_classes = n_classes self.input_size = input_size self.output_size = output_size self.do_augment = do_augment self._repeat_times = repeat_times self._batch_size = batch_size self.img_seg_pairs = get_pairs_from_paths(images_path, segs_path) random.shuffle(self.img_seg_pairs) self.zipped = itertools.cycle(self.img_seg_pairs) self.counter = 0 def __len__(self): return int(len(self.img_seg_pairs) * self._repeat_times/self._batch_size) def __getitem__(self, idx): """ # Args idx : batch index """ x_batch = [] y_batch= [] for i in range(self._batch_size): img, seg = next(self.zipped) img = cv2.imread(img, 1)[...,::-1] seg = cv2.imread(seg, 1) im, seg = process_image_segmentation(img, seg, self.input_size[0], self.input_size[1], self.output_size[0], self.output_size[1], self.do_augment) x_batch.append(get_image_array(im, self.norm, ordering=IMAGE_ORDERING)) y_batch.append(get_segmentation_array(seg, self.n_classes)) x_batch = np.array(x_batch) y_batch = np.array(y_batch) self.counter += 1 return x_batch, y_batch def on_epoch_end(self): self.counter = 0 random.shuffle(self.img_seg_pairs) ================================================ FILE: axelerate/networks/segnet/frontend_segnet.py ================================================ import os import numpy as np import cv2 import time from tqdm import tqdm from axelerate.networks.segnet.data_utils.data_loader import create_batch_generator, verify_segmentation_dataset from axelerate.networks.common_utils.feature import create_feature_extractor from axelerate.networks.common_utils.fit import train from axelerate.networks.segnet.models.segnet import mobilenet_segnet, squeezenet_segnet, full_yolo_segnet, tiny_yolo_segnet, nasnetmobile_segnet, resnet50_segnet, densenet121_segnet def masked_categorical_crossentropy(gt , pr ): from tensorflow.keras.losses import categorical_crossentropy mask = 1 - gt[: , : , 0] return categorical_crossentropy(gt, pr)*mask def create_segnet(architecture, input_size, n_classes, weights = None): if architecture == 'NASNetMobile': model = nasnetmobile_segnet(n_classes, input_size, encoder_level=4, weights = weights) elif architecture == 'SqueezeNet': model = squeezenet_segnet(n_classes, input_size, encoder_level=4, weights = weights) elif architecture == 'Full Yolo': model = full_yolo_segnet(n_classes, input_size, encoder_level=4, weights = weights) elif architecture == 'Tiny Yolo': model = tiny_yolo_segnet(n_classes, input_size, encoder_level=4, weights = weights) elif architecture == 'DenseNet121': model = densenet121_segnet(n_classes, input_size, encoder_level=4, weights = weights) elif architecture == 'ResNet50': model = resnet50_segnet(n_classes, input_size, encoder_level=4, weights = weights) elif 'MobileNet' in architecture: model = mobilenet_segnet(n_classes, input_size, encoder_level=4, weights = weights, architecture = architecture) output_size = (model.output_height, 
model.output_width) network = Segnet(model, input_size, n_classes, model.normalize, output_size) return network class Segnet(object): def __init__(self, network, input_size, n_classes, norm, output_size): self.network = network self.n_classes = n_classes self.input_size = input_size self.output_size = output_size self.norm = norm def load_weights(self, weight_path, by_name=False): if os.path.exists(weight_path): print("Loading pre-trained weights for the whole model: ", weight_path) self.network.load_weights(weight_path) else: print("Failed to load pre-trained weights for the whole model. It might be because you didn't specify any or the weight file cannot be found") def predict(self, image): start_time = time.time() Y_pred = np.squeeze(self.network.predict(image)) elapsed_ms = (time.time() - start_time) * 1000 y_pred = np.argmax(Y_pred, axis = 2) return elapsed_ms, y_pred def evaluate(self, img_folder, ann_folder, batch_size): self.generator = create_batch_generator(img_folder, ann_folder, self.input_size, self.output_size, self.n_classes, batch_size, 1, False, self.norm) tp = np.zeros(self.n_classes) fp = np.zeros(self.n_classes) fn = np.zeros(self.n_classes) n_pixels = np.zeros(self.n_classes) for inp, gt in tqdm(list(self.generator)): y_pred = self.network.predict(inp) y_pred = np.argmax(y_pred, axis=-1) gt = np.argmax(gt, axis=-1) for cl_i in range(self.n_classes): tp[cl_i] += np.sum((y_pred == cl_i) * (gt == cl_i)) fp[cl_i] += np.sum((y_pred == cl_i) * ((gt != cl_i))) fn[cl_i] += np.sum((y_pred != cl_i) * ((gt == cl_i))) n_pixels[cl_i] += np.sum(gt == cl_i) cl_wise_score = tp / (tp + fp + fn + 0.000000000001) n_pixels_norm = n_pixels / np.sum(n_pixels) frequency_weighted_IU = np.sum(cl_wise_score*n_pixels_norm) mean_IU = np.mean(cl_wise_score) report = {"frequency_weighted_IU":frequency_weighted_IU , "mean_IU":mean_IU , "class_wise_IU":cl_wise_score} return report def train(self, img_folder, ann_folder, nb_epoch, project_folder, batch_size=8, do_augment=False, learning_rate=1e-4, train_times=1, valid_times=1, valid_img_folder="", valid_ann_folder="", first_trainable_layer=None, ignore_zero_class=False, metrics='val_loss'): if metrics != "accuracy" and metrics != "loss": print("Unknown metric for SegNet, valid options are: val_loss or val_accuracy. 
Defaulting ot val_loss") metrics = "loss" if ignore_zero_class: loss_k = masked_categorical_crossentropy else: loss_k = 'categorical_crossentropy' train_generator = create_batch_generator(img_folder, ann_folder, self.input_size, self.output_size, self.n_classes,batch_size, train_times, do_augment, self.norm) validation_generator = create_batch_generator(valid_img_folder, valid_ann_folder, self.input_size, self.output_size, self.n_classes, batch_size, valid_times, False, self.norm) return train(self.network, loss_k, train_generator, validation_generator, learning_rate, nb_epoch, project_folder, first_trainable_layer, metric_name = metrics) ================================================ FILE: axelerate/networks/segnet/metrics.py ================================================ import numpy as np EPS = 1e-12 def get_iou(gt, pr, n_classes): class_wise = np.zeros(n_classes) for cl in range(n_classes): intersection = np.sum((gt == cl)*(pr == cl)) union = np.sum(np.maximum((gt == cl), (pr == cl))) iou = float(intersection)/(union + EPS) class_wise[cl] = iou return class_wise ================================================ FILE: axelerate/networks/segnet/models/__init__.py ================================================ ================================================ FILE: axelerate/networks/segnet/models/_pspnet_2.py ================================================ # This code is proveded by Vladkryvoruchko and small modifications done by me . from math import ceil from sys import exit from keras import layers from keras.layers import Conv2D, MaxPooling2D, AveragePooling2D from keras.layers import BatchNormalization, Activation, Input, Dropout, \ ZeroPadding2D, Lambda from keras.layers.merge import Concatenate, Add from keras.models import Model from keras.optimizers import SGD import tensorflow as tf from .config import IMAGE_ORDERING from .model_utils import get_segmentation_model, resize_image learning_rate = 1e-3 # Layer specific learning rate # Weight decay not implemented def BN(name=""): return BatchNormalization(momentum=0.95, name=name, epsilon=1e-5) class Interp(layers.Layer): def __init__(self, new_size, **kwargs): self.new_size = new_size super(Interp, self).__init__(**kwargs) def build(self, input_shape): super(Interp, self).build(input_shape) def call(self, inputs, **kwargs): new_height, new_width = self.new_size try: resized = tf.image.resize(inputs, [new_height, new_width]) except AttributeError: resized = tf.image.resize_images(inputs, [new_height, new_width], align_corners=True) return resized def compute_output_shape(self, input_shape): return tuple([None, self.new_size[0], self.new_size[1], input_shape[3]]) def get_config(self): config = super(Interp, self).get_config() config['new_size'] = self.new_size return config # def Interp(x, shape): # new_height, new_width = shape # resized = tf.image.resize_images(x, [new_height, new_width], # align_corners=True) # return resized def residual_conv(prev, level, pad=1, lvl=1, sub_lvl=1, modify_stride=False): lvl = str(lvl) sub_lvl = str(sub_lvl) names = ["conv" + lvl + "_" + sub_lvl + "_1x1_reduce", "conv" + lvl + "_" + sub_lvl + "_1x1_reduce_bn", "conv" + lvl + "_" + sub_lvl + "_3x3", "conv" + lvl + "_" + sub_lvl + "_3x3_bn", "conv" + lvl + "_" + sub_lvl + "_1x1_increase", "conv" + lvl + "_" + sub_lvl + "_1x1_increase_bn"] if modify_stride is False: prev = Conv2D(64 * level, (1, 1), strides=(1, 1), name=names[0], use_bias=False)(prev) elif modify_stride is True: prev = Conv2D(64 * level, (1, 1), strides=(2, 2), name=names[0], 
use_bias=False)(prev) prev = BN(name=names[1])(prev) prev = Activation('relu')(prev) prev = ZeroPadding2D(padding=(pad, pad))(prev) prev = Conv2D(64 * level, (3, 3), strides=(1, 1), dilation_rate=pad, name=names[2], use_bias=False)(prev) prev = BN(name=names[3])(prev) prev = Activation('relu')(prev) prev = Conv2D(256 * level, (1, 1), strides=(1, 1), name=names[4], use_bias=False)(prev) prev = BN(name=names[5])(prev) return prev def short_convolution_branch(prev, level, lvl=1, sub_lvl=1, modify_stride=False): lvl = str(lvl) sub_lvl = str(sub_lvl) names = ["conv" + lvl + "_" + sub_lvl + "_1x1_proj", "conv" + lvl + "_" + sub_lvl + "_1x1_proj_bn"] if modify_stride is False: prev = Conv2D(256 * level, (1, 1), strides=(1, 1), name=names[0], use_bias=False)(prev) elif modify_stride is True: prev = Conv2D(256 * level, (1, 1), strides=(2, 2), name=names[0], use_bias=False)(prev) prev = BN(name=names[1])(prev) return prev def empty_branch(prev): return prev def residual_short(prev_layer, level, pad=1, lvl=1, sub_lvl=1, modify_stride=False): prev_layer = Activation('relu')(prev_layer) block_1 = residual_conv(prev_layer, level, pad=pad, lvl=lvl, sub_lvl=sub_lvl, modify_stride=modify_stride) block_2 = short_convolution_branch(prev_layer, level, lvl=lvl, sub_lvl=sub_lvl, modify_stride=modify_stride) added = Add()([block_1, block_2]) return added def residual_empty(prev_layer, level, pad=1, lvl=1, sub_lvl=1): prev_layer = Activation('relu')(prev_layer) block_1 = residual_conv(prev_layer, level, pad=pad, lvl=lvl, sub_lvl=sub_lvl) block_2 = empty_branch(prev_layer) added = Add()([block_1, block_2]) return added def ResNet(inp, layers): # Names for the first couple layers of model names = ["conv1_1_3x3_s2", "conv1_1_3x3_s2_bn", "conv1_2_3x3", "conv1_2_3x3_bn", "conv1_3_3x3", "conv1_3_3x3_bn"] # Short branch(only start of network) cnv1 = Conv2D(64, (3, 3), strides=(2, 2), padding='same', name=names[0], use_bias=False)(inp) # "conv1_1_3x3_s2" bn1 = BN(name=names[1])(cnv1) # "conv1_1_3x3_s2/bn" relu1 = Activation('relu')(bn1) # "conv1_1_3x3_s2/relu" cnv1 = Conv2D(64, (3, 3), strides=(1, 1), padding='same', name=names[2], use_bias=False)(relu1) # "conv1_2_3x3" bn1 = BN(name=names[3])(cnv1) # "conv1_2_3x3/bn" relu1 = Activation('relu')(bn1) # "conv1_2_3x3/relu" cnv1 = Conv2D(128, (3, 3), strides=(1, 1), padding='same', name=names[4], use_bias=False)(relu1) # "conv1_3_3x3" bn1 = BN(name=names[5])(cnv1) # "conv1_3_3x3/bn" relu1 = Activation('relu')(bn1) # "conv1_3_3x3/relu" res = MaxPooling2D(pool_size=(3, 3), padding='same', strides=(2, 2))(relu1) # "pool1_3x3_s2" # ---Residual layers(body of network) """ Modify_stride --Used only once in first 3_1 convolutions block. 
changes stride of first convolution from 1 -> 2 """ # 2_1 - 2_3 res = residual_short(res, 1, pad=1, lvl=2, sub_lvl=1) for i in range(2): res = residual_empty(res, 1, pad=1, lvl=2, sub_lvl=i + 2) # 3_1 - 3_3 res = residual_short(res, 2, pad=1, lvl=3, sub_lvl=1, modify_stride=True) for i in range(3): res = residual_empty(res, 2, pad=1, lvl=3, sub_lvl=i + 2) if layers == 50: # 4_1 - 4_6 res = residual_short(res, 4, pad=2, lvl=4, sub_lvl=1) for i in range(5): res = residual_empty(res, 4, pad=2, lvl=4, sub_lvl=i + 2) elif layers == 101: # 4_1 - 4_23 res = residual_short(res, 4, pad=2, lvl=4, sub_lvl=1) for i in range(22): res = residual_empty(res, 4, pad=2, lvl=4, sub_lvl=i + 2) else: print("This ResNet is not implemented") # 5_1 - 5_3 res = residual_short(res, 8, pad=4, lvl=5, sub_lvl=1) for i in range(2): res = residual_empty(res, 8, pad=4, lvl=5, sub_lvl=i + 2) res = Activation('relu')(res) return res def interp_block(prev_layer, level, feature_map_shape, input_shape): if input_shape == (473, 473): kernel_strides_map = {1: 60, 2: 30, 3: 20, 6: 10} elif input_shape == (713, 713): kernel_strides_map = {1: 90, 2: 45, 3: 30, 6: 15} else: print("Pooling parameters for input shape ", input_shape, " are not defined.") exit(1) names = [ "conv5_3_pool" + str(level) + "_conv", "conv5_3_pool" + str(level) + "_conv_bn" ] kernel = (kernel_strides_map[level], kernel_strides_map[level]) strides = (kernel_strides_map[level], kernel_strides_map[level]) prev_layer = AveragePooling2D(kernel, strides=strides)(prev_layer) prev_layer = Conv2D(512, (1, 1), strides=(1, 1), name=names[0], use_bias=False)(prev_layer) prev_layer = BN(name=names[1])(prev_layer) prev_layer = Activation('relu')(prev_layer) # prev_layer = Lambda(Interp, arguments={ # 'shape': feature_map_shape})(prev_layer) prev_layer = Interp(feature_map_shape)(prev_layer) return prev_layer def build_pyramid_pooling_module(res, input_shape): """Build the Pyramid Pooling Module.""" # ---PSPNet concat layers with Interpolation feature_map_size = tuple(int(ceil(input_dim / 8.0)) for input_dim in input_shape) interp_block1 = interp_block(res, 1, feature_map_size, input_shape) interp_block2 = interp_block(res, 2, feature_map_size, input_shape) interp_block3 = interp_block(res, 3, feature_map_size, input_shape) interp_block6 = interp_block(res, 6, feature_map_size, input_shape) # concat all these layers. resulted # shape=(1,feature_map_size_x,feature_map_size_y,4096) res = Concatenate()([res, interp_block6, interp_block3, interp_block2, interp_block1]) return res def _build_pspnet(nb_classes, resnet_layers, input_shape, activation='softmax'): assert IMAGE_ORDERING == 'channels_last' inp = Input((input_shape[0], input_shape[1], 3)) res = ResNet(inp, layers=resnet_layers) psp = build_pyramid_pooling_module(res, input_shape) x = Conv2D(512, (3, 3), strides=(1, 1), padding="same", name="conv5_4", use_bias=False)(psp) x = BN(name="conv5_4_bn")(x) x = Activation('relu')(x) x = Dropout(0.1)(x) x = Conv2D(nb_classes, (1, 1), strides=(1, 1), name="conv6")(x) # x = Lambda(Interp, arguments={'shape': ( # input_shape[0], input_shape[1])})(x) x = Interp([input_shape[0], input_shape[1]])(x) model = get_segmentation_model(inp, x) return model ================================================ FILE: axelerate/networks/segnet/models/all_models.py ================================================ from . import pspnet from . import unet from . import segnet from .
import fcn model_from_name = {} model_from_name["fcn_8"] = fcn.fcn_8 model_from_name["fcn_32"] = fcn.fcn_32 model_from_name["fcn_8_vgg"] = fcn.fcn_8_vgg model_from_name["fcn_32_vgg"] = fcn.fcn_32_vgg model_from_name["fcn_8_resnet50"] = fcn.fcn_8_resnet50 model_from_name["fcn_32_resnet50"] = fcn.fcn_32_resnet50 model_from_name["fcn_8_mobilenet"] = fcn.fcn_8_mobilenet model_from_name["fcn_32_mobilenet"] = fcn.fcn_32_mobilenet model_from_name["pspnet"] = pspnet.pspnet model_from_name["vgg_pspnet"] = pspnet.vgg_pspnet model_from_name["resnet50_pspnet"] = pspnet.resnet50_pspnet model_from_name["pspnet_50"] = pspnet.pspnet_50 model_from_name["pspnet_101"] = pspnet.pspnet_101 # model_from_name["mobilenet_pspnet"] = pspnet.mobilenet_pspnet model_from_name["unet_mini"] = unet.unet_mini model_from_name["unet"] = unet.unet model_from_name["vgg_unet"] = unet.vgg_unet model_from_name["resnet50_unet"] = unet.resnet50_unet model_from_name["mobilenet_unet"] = unet.mobilenet_unet model_from_name["segnet"] = segnet.segnet model_from_name["vgg_segnet"] = segnet.vgg_segnet model_from_name["resnet50_segnet"] = segnet.resnet50_segnet model_from_name["mobilenet_segnet"] = segnet.mobilenet_segnet ================================================ FILE: axelerate/networks/segnet/models/basic_models.py ================================================ from keras.models import * from keras.layers import * import keras.backend as K from .config import IMAGE_ORDERING def vanilla_encoder(input_height=224, input_width=224): kernel = 3 filter_size = 64 pad = 1 pool_size = 2 if IMAGE_ORDERING == 'channels_first': img_input = Input(shape=(3, input_height, input_width)) elif IMAGE_ORDERING == 'channels_last': img_input = Input(shape=(input_height, input_width, 3)) x = img_input levels = [] x = (ZeroPadding2D((pad, pad), data_format=IMAGE_ORDERING))(x) x = (Conv2D(filter_size, (kernel, kernel), data_format=IMAGE_ORDERING, padding='valid'))(x) x = (BatchNormalization())(x) x = (Activation('relu'))(x) x = (MaxPooling2D((pool_size, pool_size), data_format=IMAGE_ORDERING))(x) levels.append(x) x = (ZeroPadding2D((pad, pad), data_format=IMAGE_ORDERING))(x) x = (Conv2D(128, (kernel, kernel), data_format=IMAGE_ORDERING, padding='valid'))(x) x = (BatchNormalization())(x) x = (Activation('relu'))(x) x = (MaxPooling2D((pool_size, pool_size), data_format=IMAGE_ORDERING))(x) levels.append(x) for _ in range(3): x = (ZeroPadding2D((pad, pad), data_format=IMAGE_ORDERING))(x) x = (Conv2D(256, (kernel, kernel), data_format=IMAGE_ORDERING, padding='valid'))(x) x = (BatchNormalization())(x) x = (Activation('relu'))(x) x = (MaxPooling2D((pool_size, pool_size), data_format=IMAGE_ORDERING))(x) levels.append(x) return img_input, levels ================================================ FILE: axelerate/networks/segnet/models/config.py ================================================ IMAGE_ORDERING_CHANNELS_LAST = "channels_last" IMAGE_ORDERING_CHANNELS_FIRST = "channels_first" # Default IMAGE_ORDERING = channels_last IMAGE_ORDERING = IMAGE_ORDERING_CHANNELS_LAST ================================================ FILE: axelerate/networks/segnet/models/fcn.py ================================================ from keras.models import * from keras.layers import * from .config import IMAGE_ORDERING from .model_utils import get_segmentation_model from .vgg16 import get_vgg_encoder from .mobilenet import get_mobilenet_encoder from .basic_models import
vanilla_encoder from .resnet50 import get_resnet50_encoder # crop o1 wrt o2 def crop(o1, o2, i): o_shape2 = Model(i, o2).output_shape if IMAGE_ORDERING == 'channels_first': output_height2 = o_shape2[2] output_width2 = o_shape2[3] else: output_height2 = o_shape2[1] output_width2 = o_shape2[2] o_shape1 = Model(i, o1).output_shape if IMAGE_ORDERING == 'channels_first': output_height1 = o_shape1[2] output_width1 = o_shape1[3] else: output_height1 = o_shape1[1] output_width1 = o_shape1[2] cx = abs(output_width1 - output_width2) cy = abs(output_height2 - output_height1) if output_width1 > output_width2: o1 = Cropping2D(cropping=((0, 0), (0, cx)), data_format=IMAGE_ORDERING)(o1) else: o2 = Cropping2D(cropping=((0, 0), (0, cx)), data_format=IMAGE_ORDERING)(o2) if output_height1 > output_height2: o1 = Cropping2D(cropping=((0, cy), (0, 0)), data_format=IMAGE_ORDERING)(o1) else: o2 = Cropping2D(cropping=((0, cy), (0, 0)), data_format=IMAGE_ORDERING)(o2) return o1, o2 def fcn_8(n_classes, encoder=vanilla_encoder, input_height=416, input_width=608): img_input, levels = encoder( input_height=input_height, input_width=input_width) [f1, f2, f3, f4, f5] = levels o = f5 o = (Conv2D(4096, (7, 7), activation='relu', padding='same', data_format=IMAGE_ORDERING))(o) o = Dropout(0.5)(o) o = (Conv2D(4096, (1, 1), activation='relu', padding='same', data_format=IMAGE_ORDERING))(o) o = Dropout(0.5)(o) o = (Conv2D(n_classes, (1, 1), kernel_initializer='he_normal', data_format=IMAGE_ORDERING))(o) o = Conv2DTranspose(n_classes, kernel_size=(4, 4), strides=( 2, 2), use_bias=False, data_format=IMAGE_ORDERING)(o) o2 = f4 o2 = (Conv2D(n_classes, (1, 1), kernel_initializer='he_normal', data_format=IMAGE_ORDERING))(o2) o, o2 = crop(o, o2, img_input) o = Add()([o, o2]) o = Conv2DTranspose(n_classes, kernel_size=(4, 4), strides=( 2, 2), use_bias=False, data_format=IMAGE_ORDERING)(o) o2 = f3 o2 = (Conv2D(n_classes, (1, 1), kernel_initializer='he_normal', data_format=IMAGE_ORDERING))(o2) o2, o = crop(o2, o, img_input) o = Add()([o2, o]) o = Conv2DTranspose(n_classes, kernel_size=(16, 16), strides=( 8, 8), use_bias=False, data_format=IMAGE_ORDERING)(o) model = get_segmentation_model(img_input, o) model.model_name = "fcn_8" return model def fcn_32(n_classes, encoder=vanilla_encoder, input_height=416, input_width=608): img_input, levels = encoder( input_height=input_height, input_width=input_width) [f1, f2, f3, f4, f5] = levels o = f5 o = (Conv2D(4096, (7, 7), activation='relu', padding='same', data_format=IMAGE_ORDERING))(o) o = Dropout(0.5)(o) o = (Conv2D(4096, (1, 1), activation='relu', padding='same', data_format=IMAGE_ORDERING))(o) o = Dropout(0.5)(o) o = (Conv2D(n_classes, (1, 1), kernel_initializer='he_normal', data_format=IMAGE_ORDERING))(o) o = Conv2DTranspose(n_classes, kernel_size=(64, 64), strides=( 32, 32), use_bias=False, data_format=IMAGE_ORDERING)(o) model = get_segmentation_model(img_input, o) model.model_name = "fcn_32" return model def fcn_8_vgg(n_classes, input_height=416, input_width=608): model = fcn_8(n_classes, get_vgg_encoder, input_height=input_height, input_width=input_width) model.model_name = "fcn_8_vgg" return model def fcn_32_vgg(n_classes, input_height=416, input_width=608): model = fcn_32(n_classes, get_vgg_encoder, input_height=input_height, input_width=input_width) model.model_name = "fcn_32_vgg" return model def fcn_8_resnet50(n_classes, input_height=416, input_width=608): model = fcn_8(n_classes, get_resnet50_encoder, input_height=input_height, input_width=input_width) model.model_name = 
"fcn_8_resnet50" return model def fcn_32_resnet50(n_classes, input_height=416, input_width=608): model = fcn_32(n_classes, get_resnet50_encoder, input_height=input_height, input_width=input_width) model.model_name = "fcn_32_resnet50" return model def fcn_8_mobilenet(n_classes, input_height=416, input_width=608): model = fcn_8(n_classes, get_mobilenet_encoder, input_height=input_height, input_width=input_width) model.model_name = "fcn_8_mobilenet" return model def fcn_32_mobilenet(n_classes, input_height=416, input_width=608): model = fcn_32(n_classes, get_mobilenet_encoder, input_height=input_height, input_width=input_width) model.model_name = "fcn_32_mobilenet" return model if __name__ == '__main__': m = fcn_8(101) m = fcn_32(101) ================================================ FILE: axelerate/networks/segnet/models/model.py ================================================ """ Definition for the generic Model class """ class Model: def __init__(self, n_classes, input_height=None, input_width=None): pass ================================================ FILE: axelerate/networks/segnet/models/model_utils.py ================================================ from types import MethodType from tensorflow.keras.models import * from tensorflow.keras.layers import * import tensorflow.keras.backend as K from tqdm import tqdm from .config import IMAGE_ORDERING from ..train import train from ..predict import predict, predict_multiple, evaluate # source m1 , dest m2 def transfer_weights(m1, m2, verbose=True): assert len(m1.layers) == len( m2.layers), "Both models should have same number of layers" nSet = 0 nNotSet = 0 if verbose: print("Copying weights ") bar = tqdm(zip(m1.layers, m2.layers)) else: bar = zip(m1.layers, m2.layers) for l, ll in bar: if not any([w.shape != ww.shape for w, ww in zip(list(l.weights), list(ll.weights))]): if len(list(l.weights)) > 0: ll.set_weights(l.get_weights()) nSet += 1 else: nNotSet += 1 if verbose: print("Copied weights of %d layers and skipped %d layers" % (nSet, nNotSet)) def resize_image(inp, s, data_format): try: return Lambda(lambda x: K.resize_images(x, height_factor=s[0], width_factor=s[1], data_format=data_format, interpolation='bilinear'))(inp) except Exception as e: # if keras is old, then rely on the tf function # Sorry theano/cntk users!!! 
assert data_format == 'channels_last' assert IMAGE_ORDERING == 'channels_last' import tensorflow as tf return Lambda( lambda x: tf.image.resize_images( x, (K.int_shape(x)[1]*s[0], K.int_shape(x)[2]*s[1])) )(inp) def get_segmentation_model(input, output): img_input = input o = output o_shape = Model(img_input, o).output_shape i_shape = Model(img_input, o).input_shape if IMAGE_ORDERING == 'channels_first': output_height = o_shape[2] output_width = o_shape[3] input_height = i_shape[2] input_width = i_shape[3] n_classes = o_shape[1] #o = (Reshape((-1, output_height*output_width)))(o) o = (Permute((2, 1)))(o) elif IMAGE_ORDERING == 'channels_last': output_height = o_shape[1] output_width = o_shape[2] input_height = i_shape[1] input_width = i_shape[2] n_classes = o_shape[3] #o = (Reshape((output_height*output_width, -1)))(o) o = (Activation('softmax'))(o) model = Model(img_input, o, name = "segnet") model.output_width = output_width model.output_height = output_height model.n_classes = n_classes model.input_height = input_height model.input_width = input_width model.train = MethodType(train, model) model.predict_segmentation = MethodType(predict, model) model.predict_multiple = MethodType(predict_multiple, model) model.evaluate_segmentation = MethodType(evaluate, model) return model ================================================ FILE: axelerate/networks/segnet/models/pspnet.py ================================================ import numpy as np import keras from keras.models import * from keras.layers import * import keras.backend as K from .config import IMAGE_ORDERING from .model_utils import get_segmentation_model, resize_image from .vgg16 import get_vgg_encoder from .mobilenet import get_mobilenet_encoder from .basic_models import vanilla_encoder from .resnet50 import get_resnet50_encoder if IMAGE_ORDERING == 'channels_first': MERGE_AXIS = 1 elif IMAGE_ORDERING == 'channels_last': MERGE_AXIS = -1 def pool_block(feats, pool_factor): if IMAGE_ORDERING == 'channels_first': h = K.int_shape(feats)[2] w = K.int_shape(feats)[3] elif IMAGE_ORDERING == 'channels_last': h = K.int_shape(feats)[1] w = K.int_shape(feats)[2] pool_size = strides = [ int(np.round(float(h) / pool_factor)), int(np.round(float(w) / pool_factor))] x = AveragePooling2D(pool_size, data_format=IMAGE_ORDERING, strides=strides, padding='same')(feats) x = Conv2D(512, (1, 1), data_format=IMAGE_ORDERING, padding='same', use_bias=False)(x) x = BatchNormalization()(x) x = Activation('relu')(x) x = resize_image(x, strides, data_format=IMAGE_ORDERING) return x def _pspnet(n_classes, encoder, input_height=384, input_width=576): assert input_height % 192 == 0 assert input_width % 192 == 0 img_input, levels = encoder( input_height=input_height, input_width=input_width) [f1, f2, f3, f4, f5] = levels o = f5 pool_factors = [1, 2, 3, 6] pool_outs = [o] for p in pool_factors: pooled = pool_block(o, p) pool_outs.append(pooled) o = Concatenate(axis=MERGE_AXIS)(pool_outs) o = Conv2D(512, (1, 1), data_format=IMAGE_ORDERING, use_bias=False)(o) o = BatchNormalization()(o) o = Activation('relu')(o) o = Conv2D(n_classes, (3, 3), data_format=IMAGE_ORDERING, padding='same')(o) o = resize_image(o, (8, 8), data_format=IMAGE_ORDERING) model = get_segmentation_model(img_input, o) return model def pspnet(n_classes, input_height=384, input_width=576): model = _pspnet(n_classes, vanilla_encoder, input_height=input_height, input_width=input_width) model.model_name = "pspnet" return model def vgg_pspnet(n_classes, input_height=384, input_width=576): model = 
_pspnet(n_classes, get_vgg_encoder, input_height=input_height, input_width=input_width) model.model_name = "vgg_pspnet" return model def resnet50_pspnet(n_classes, input_height=384, input_width=576): model = _pspnet(n_classes, get_resnet50_encoder, input_height=input_height, input_width=input_width) model.model_name = "resnet50_pspnet" return model def pspnet_50(n_classes, input_height=473, input_width=473): from ._pspnet_2 import _build_pspnet nb_classes = n_classes resnet_layers = 50 input_shape = (input_height, input_width) model = _build_pspnet(nb_classes=nb_classes, resnet_layers=resnet_layers, input_shape=input_shape) model.model_name = "pspnet_50" return model def pspnet_101(n_classes, input_height=473, input_width=473): from ._pspnet_2 import _build_pspnet nb_classes = n_classes resnet_layers = 101 input_shape = (input_height, input_width) model = _build_pspnet(nb_classes=nb_classes, resnet_layers=resnet_layers, input_shape=input_shape) model.model_name = "pspnet_101" return model # def mobilenet_pspnet( n_classes , input_height=224, input_width=224 ): # model = _pspnet(n_classes, get_mobilenet_encoder, # input_height=input_height, input_width=input_width) # model.model_name = "mobilenet_pspnet" # return model if __name__ == '__main__': m = _pspnet(101, vanilla_encoder) # m = _pspnet( 101 , get_mobilenet_encoder ,True , 224 , 224 ) m = _pspnet(101, get_vgg_encoder) m = _pspnet(101, get_resnet50_encoder) ================================================ FILE: axelerate/networks/segnet/models/segnet.py ================================================ import os from tensorflow.keras.models import * from tensorflow.keras.layers import * from .config import IMAGE_ORDERING from .model_utils import get_segmentation_model from axelerate.networks.common_utils.feature import create_feature_extractor mobilenet = {1:10,2:23,3:36,4:73,5:86} densenet121 = {1:8,2:50,3:138,4:310,5:426} nasnetmobile = {1:7,2:64,3:295,4:537,5:768} squeezenet = {1:2,2:17,3:32,4:47,5:61} full_yolo = {1:14,2:27,3:40,4:53,5:73} tiny_yolo = {1:7,2:15,3:23,4:27,5:30} resnet50 = {1:2,2:37,3:80,4:142,5:174} def chopper(model, model_name, f): outputs = model.layers[model_name[f]].output def segnet_decoder(f, n_classes, n_up=3): assert n_up >= 2 o = f o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) o = (Conv2D(256, (3, 3), padding='valid', data_format=IMAGE_ORDERING))(o) o = (BatchNormalization())(o) o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) o = (Conv2D(128, (3, 3), padding='valid', data_format=IMAGE_ORDERING))(o) o = (BatchNormalization())(o) for _ in range(n_up-2): o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) o = (Conv2D(64, (3, 3), padding='valid', data_format=IMAGE_ORDERING))(o) o = (BatchNormalization())(o) o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) o = (Conv2D(32, (3, 3), padding='valid', data_format=IMAGE_ORDERING))(o) o = (BatchNormalization())(o) o = Conv2D(n_classes, (3, 3), padding='same', data_format=IMAGE_ORDERING)(o) return o def _segnet(n_classes, encoder_input, encoder_output, input_height=416, input_width=608, encoder_level=3): o = segnet_decoder(f=encoder_output, n_classes=n_classes, n_up=encoder_level-1) model = get_segmentation_model(encoder_input, o) return model def full_yolo_segnet(n_classes, input_size, encoder_level, weights): encoder = create_feature_extractor('Full 
Yolo',input_size, weights) encoder_output = encoder.feature_extractor.layers[full_yolo[encoder_level]].output print(encoder_output) encoder_input = encoder.feature_extractor.inputs[0] encoder_level += 1 model = _segnet(n_classes, encoder_input, encoder_output, input_size, encoder_level=encoder_level) model.model_name = "full_yolo_segnet" model.normalize = encoder.normalize return model def tiny_yolo_segnet(n_classes, input_size, encoder_level, weights): encoder = create_feature_extractor('Tiny Yolo',input_size, weights) encoder_output = encoder.feature_extractor.layers[tiny_yolo[encoder_level]].output print(encoder_output) encoder_input = encoder.feature_extractor.inputs[0] encoder_level += 1 model = _segnet(n_classes, encoder_input, encoder_output, input_size, encoder_level=encoder_level) model.model_name = "tiny_yolo_segnet" model.normalize = encoder.normalize return model def squeezenet_segnet(n_classes, input_size, encoder_level, weights): encoder = create_feature_extractor('SqueezeNet',input_size, weights) encoder_output = encoder.feature_extractor.layers[squeezenet[encoder_level]].output encoder_input = encoder.feature_extractor.inputs[0] model = _segnet(n_classes, encoder_input, encoder_output, input_size, encoder_level=encoder_level) model.model_name = "squeezenet_segnet" model.normalize = encoder.normalize return model def densenet121_segnet(n_classes, input_size, encoder_level, weights): encoder = create_feature_extractor('DenseNet121', input_size, weights) encoder_output = encoder.feature_extractor.layers[densenet121[encoder_level]].output encoder_input = encoder.feature_extractor.inputs[0] model = _segnet(n_classes, encoder_input, encoder_output, input_size, encoder_level=encoder_level) model.model_name = "densenet121_segnet" model.normalize = encoder.normalize return model def nasnetmobile_segnet(n_classes, input_size, encoder_level, weights): encoder = create_feature_extractor('NASNetMobile', input_size, weights) encoder_output = encoder.feature_extractor.layers[nasnetmobile[encoder_level]].output encoder_input = encoder.feature_extractor.inputs[0] model = _segnet(n_classes, encoder_input, encoder_output, input_size, encoder_level=encoder_level) model.model_name = "nasnetmobile_segnet" model.normalize = encoder.normalize return model def resnet50_segnet(n_classes, input_size, encoder_level, weights): encoder = create_feature_extractor('ResNet50',input_size, weights) encoder_output = encoder.feature_extractor.layers[resnet50[encoder_level]].output encoder_input = encoder.feature_extractor.inputs[0] model = _segnet(n_classes, encoder_input, encoder_output, input_size, encoder_level=encoder_level) model.model_name = "resnet50_segnet" model.normalize = encoder.normalize return model def mobilenet_segnet(n_classes, input_size, encoder_level, weights, architecture = 'MobileNet2_5'): encoder = create_feature_extractor(architecture, input_size, weights) encoder_output = encoder.feature_extractor.layers[mobilenet[encoder_level]].output encoder_input = encoder.feature_extractor.inputs[0] model = _segnet(n_classes, encoder_input, encoder_output, input_size, encoder_level=encoder_level) model.model_name = "mobilenet_segnet" model.normalize = encoder.normalize return model ================================================ FILE: axelerate/networks/segnet/models/unet.py ================================================ from keras.models import * from keras.layers import * from .config import IMAGE_ORDERING from .model_utils import get_segmentation_model from .vgg16 import get_vgg_encoder from 
.mobilenet import get_mobilenet_encoder from .basic_models import vanilla_encoder from .resnet50 import get_resnet50_encoder if IMAGE_ORDERING == 'channels_first': MERGE_AXIS = 1 elif IMAGE_ORDERING == 'channels_last': MERGE_AXIS = -1 def unet_mini(n_classes, input_height=360, input_width=480): if IMAGE_ORDERING == 'channels_first': img_input = Input(shape=(3, input_height, input_width)) elif IMAGE_ORDERING == 'channels_last': img_input = Input(shape=(input_height, input_width, 3)) conv1 = Conv2D(32, (3, 3), data_format=IMAGE_ORDERING, activation='relu', padding='same')(img_input) conv1 = Dropout(0.2)(conv1) conv1 = Conv2D(32, (3, 3), data_format=IMAGE_ORDERING, activation='relu', padding='same')(conv1) pool1 = MaxPooling2D((2, 2), data_format=IMAGE_ORDERING)(conv1) conv2 = Conv2D(64, (3, 3), data_format=IMAGE_ORDERING, activation='relu', padding='same')(pool1) conv2 = Dropout(0.2)(conv2) conv2 = Conv2D(64, (3, 3), data_format=IMAGE_ORDERING, activation='relu', padding='same')(conv2) pool2 = MaxPooling2D((2, 2), data_format=IMAGE_ORDERING)(conv2) conv3 = Conv2D(128, (3, 3), data_format=IMAGE_ORDERING, activation='relu', padding='same')(pool2) conv3 = Dropout(0.2)(conv3) conv3 = Conv2D(128, (3, 3), data_format=IMAGE_ORDERING, activation='relu', padding='same')(conv3) up1 = concatenate([UpSampling2D((2, 2), data_format=IMAGE_ORDERING)( conv3), conv2], axis=MERGE_AXIS) conv4 = Conv2D(64, (3, 3), data_format=IMAGE_ORDERING, activation='relu', padding='same')(up1) conv4 = Dropout(0.2)(conv4) conv4 = Conv2D(64, (3, 3), data_format=IMAGE_ORDERING, activation='relu', padding='same')(conv4) up2 = concatenate([UpSampling2D((2, 2), data_format=IMAGE_ORDERING)( conv4), conv1], axis=MERGE_AXIS) conv5 = Conv2D(32, (3, 3), data_format=IMAGE_ORDERING, activation='relu', padding='same')(up2) conv5 = Dropout(0.2)(conv5) conv5 = Conv2D(32, (3, 3), data_format=IMAGE_ORDERING, activation='relu', padding='same')(conv5) o = Conv2D(n_classes, (1, 1), data_format=IMAGE_ORDERING, padding='same')(conv5) model = get_segmentation_model(img_input, o) model.model_name = "unet_mini" return model def _unet(n_classes, encoder, l1_skip_conn=True, input_height=416, input_width=608): img_input, levels = encoder( input_height=input_height, input_width=input_width) [f1, f2, f3, f4, f5] = levels o = f4 o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) o = (Conv2D(512, (3, 3), padding='valid', data_format=IMAGE_ORDERING))(o) o = (BatchNormalization())(o) o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) o = (concatenate([o, f3], axis=MERGE_AXIS)) o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) o = (Conv2D(256, (3, 3), padding='valid', data_format=IMAGE_ORDERING))(o) o = (BatchNormalization())(o) o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) o = (concatenate([o, f2], axis=MERGE_AXIS)) o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) o = (Conv2D(128, (3, 3), padding='valid', data_format=IMAGE_ORDERING))(o) o = (BatchNormalization())(o) o = (UpSampling2D((2, 2), data_format=IMAGE_ORDERING))(o) if l1_skip_conn: o = (concatenate([o, f1], axis=MERGE_AXIS)) o = (ZeroPadding2D((1, 1), data_format=IMAGE_ORDERING))(o) o = (Conv2D(64, (3, 3), padding='valid', data_format=IMAGE_ORDERING))(o) o = (BatchNormalization())(o) o = Conv2D(n_classes, (3, 3), padding='same', data_format=IMAGE_ORDERING)(o) model = get_segmentation_model(img_input, o) return model def unet(n_classes, input_height=416, input_width=608, encoder_level=3): model = _unet(n_classes, vanilla_encoder, input_height=input_height, 
input_width=input_width) model.model_name = "unet" return model def vgg_unet(n_classes, input_height=416, input_width=608, encoder_level=3): model = _unet(n_classes, get_vgg_encoder, input_height=input_height, input_width=input_width) model.model_name = "vgg_unet" return model def resnet50_unet(n_classes, input_height=416, input_width=608, encoder_level=3): model = _unet(n_classes, get_resnet50_encoder, input_height=input_height, input_width=input_width) model.model_name = "resnet50_unet" return model def mobilenet_unet(n_classes, input_height=224, input_width=224, encoder_level=3): model = _unet(n_classes, get_mobilenet_encoder, input_height=input_height, input_width=input_width) model.model_name = "mobilenet_unet" return model if __name__ == '__main__': m = unet_mini(101) m = _unet(101, vanilla_encoder) # m = _unet( 101 , get_mobilenet_encoder ,True , 224 , 224 ) m = _unet(101, get_vgg_encoder) m = _unet(101, get_resnet50_encoder) ================================================ FILE: axelerate/networks/segnet/predict.py ================================================ import glob import random import json import os import cv2 import numpy as np np.set_printoptions(threshold=np.inf) from tqdm import tqdm from tensorflow.keras.models import load_model from axelerate.networks.segnet.train import find_latest_checkpoint from axelerate.networks.segnet.data_utils.data_loader import get_image_array, get_segmentation_array, DATA_LOADER_SEED, class_colors, get_pairs_from_paths from axelerate.networks.segnet.models.config import IMAGE_ORDERING from . import metrics import six random.seed(DATA_LOADER_SEED) def model_from_checkpoint_path(checkpoints_path): from .models.all_models import model_from_name assert (os.path.isfile(checkpoints_path+"_config.json") ), "Checkpoint not found." model_config = json.loads( open(checkpoints_path+"_config.json", "r").read()) latest_weights = find_latest_checkpoint(checkpoints_path) assert (latest_weights is not None), "Checkpoint not found." 
    model = model_from_name[model_config['model_class']](
        model_config['n_classes'], input_height=model_config['input_height'],
        input_width=model_config['input_width'])
    print("loaded weights ", latest_weights)
    model.load_weights(latest_weights)
    return model

def get_colored_segmentation_image(seg_arr, n_classes, colors=class_colors):
    output_height = seg_arr.shape[0]
    output_width = seg_arr.shape[1]
    seg_img = np.zeros((output_height, output_width, 3))
    for c in range(n_classes):
        seg_img[:, :, 0] += ((seg_arr[:, :] == c)*(colors[c][0])).astype('uint8')
        seg_img[:, :, 1] += ((seg_arr[:, :] == c)*(colors[c][1])).astype('uint8')
        seg_img[:, :, 2] += ((seg_arr[:, :] == c)*(colors[c][2])).astype('uint8')
    seg_img = seg_img.astype('uint8')
    return seg_img

def get_legends(class_names, colors=class_colors):
    n_classes = len(class_names)
    legend = np.zeros(((len(class_names) * 25) + 25, 125, 3), dtype="uint8") + 255
    for (i, (class_name, color)) in enumerate(zip(class_names[:n_classes], colors[:n_classes])):
        color = [int(c) for c in color]
        cv2.putText(legend, class_name, (5, (i * 25) + 17),
                    cv2.FONT_HERSHEY_COMPLEX, 0.5, (0, 0, 0), 1)
        cv2.rectangle(legend, (100, (i * 25)), (125, (i * 25) + 25),
                      tuple(color), -1)
    return legend

def overlay_seg_image(inp_img, seg_img):
    original_h = inp_img.shape[0]
    original_w = inp_img.shape[1]
    seg_img = cv2.resize(seg_img, (original_w, original_h))
    fused_img = (inp_img/2 + seg_img/2).astype('uint8')
    return fused_img

def concat_legends(seg_img, legend_img):
    new_h = np.maximum(seg_img.shape[0], legend_img.shape[0])
    new_w = seg_img.shape[1] + legend_img.shape[1]
    out_img = np.zeros((new_h, new_w, 3)).astype('uint8') + legend_img[0, 0, 0]
    out_img[:legend_img.shape[0], :legend_img.shape[1]] = np.copy(legend_img)
    out_img[:seg_img.shape[0], legend_img.shape[1]:] = np.copy(seg_img)
    return out_img

def visualize_segmentation(seg_arr, inp_img=None, n_classes=None,
                           colors=class_colors, class_names=None,
                           overlay_img=False, show_legends=False,
                           prediction_width=None, prediction_height=None):
    print("Found the following classes in the segmentation image:", np.unique(seg_arr))
    if n_classes is None:
        n_classes = np.max(seg_arr)
    seg_img = get_colored_segmentation_image(seg_arr, n_classes, colors=colors)
    if inp_img is not None:
        original_h = inp_img.shape[0]
        original_w = inp_img.shape[1]
        seg_img = cv2.resize(seg_img, (original_w, original_h))
    if (prediction_height is not None) and (prediction_width is not None):
        seg_img = cv2.resize(seg_img, (prediction_width, prediction_height))
        if inp_img is not None:
            inp_img = cv2.resize(inp_img, (prediction_width, prediction_height))
    if overlay_img:
        assert inp_img is not None
        seg_img = overlay_seg_image(inp_img, seg_img)
    if show_legends:
        assert class_names is not None
        legend_img = get_legends(class_names, colors=colors)
        seg_img = concat_legends(seg_img, legend_img)
    return seg_img

def predict(model=None, inp=None, out_fname=None, image=None, overlay_img=False,
            class_names=None, show_legends=False, colors=class_colors,
            prediction_width=None, prediction_height=None):
    n_classes = model.n_classes
    pr = model.predict(inp)
    pr = np.squeeze(pr)
    #pr = pr.reshape((output_height, output_width, n_classes)).argmax(axis=2)
    pr = pr.argmax(axis=2)
    seg_img = visualize_segmentation(pr, inp_img=image, n_classes=n_classes,
                                     overlay_img=True, colors=colors)
    if out_fname is not None:
        cv2.imwrite(out_fname, seg_img)
    return pr
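# --- Usage sketch (added for illustration; the file names below are
# hypothetical and get_image_array is assumed to follow the upstream
# keras-segmentation signature). predict() expects a preprocessed batch
# `inp` plus the raw image for the overlay visualization:
#
#   model = model_from_checkpoint_path("path/to/checkpoints/segnet")
#   frame = cv2.imread("street.jpg")
#   inp = get_image_array(frame, model.input_width, model.input_height,
#                         ordering=IMAGE_ORDERING)[None, ...]
#   mask = predict(model, inp, out_fname="street_overlay.jpg", image=frame)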
def predict_multiple(model=None, inps=None, inp_dir=None, out_dir=None,
                     checkpoints_path=None, overlay_img=False, class_names=None,
                     show_legends=False, colors=class_colors,
                     prediction_width=None, prediction_height=None):
    if model is None and (checkpoints_path is not None):
        model = model_from_checkpoint_path(checkpoints_path)
    if inps is None and (inp_dir is not None):
        inps = glob.glob(os.path.join(inp_dir, "*.jpg")) + \
            glob.glob(os.path.join(inp_dir, "*.png")) + \
            glob.glob(os.path.join(inp_dir, "*.jpeg"))
    assert type(inps) is list
    all_prs = []
    for i, inp in enumerate(tqdm(inps)):
        if out_dir is None:
            out_fname = None
        else:
            if isinstance(inp, six.string_types):
                out_fname = os.path.join(out_dir, os.path.basename(inp))
            else:
                out_fname = os.path.join(out_dir, str(i) + ".jpg")
        pr = predict(model, inp, out_fname, overlay_img=overlay_img,
                     class_names=class_names, show_legends=show_legends,
                     colors=colors, prediction_width=prediction_width,
                     prediction_height=prediction_height)
        all_prs.append(pr)
    return all_prs

def evaluate(model=None, inp_images=None, annotations=None,
             inp_images_dir=None, annotations_dir=None, checkpoints_path=None):
    if model is None:
        assert (checkpoints_path is not None), "Please provide the model or the checkpoints_path"
        model = model_from_checkpoint_path(checkpoints_path)
    if inp_images is None:
        assert (inp_images_dir is not None), "Please provide inp_images or inp_images_dir"
        assert (annotations_dir is not None), "Please provide annotations or annotations_dir"
        paths = get_pairs_from_paths(inp_images_dir, annotations_dir)
        paths = list(zip(*paths))
        inp_images = list(paths[0])
        annotations = list(paths[1])
    assert type(inp_images) is list
    assert type(annotations) is list
    tp = np.zeros(model.n_classes)
    fp = np.zeros(model.n_classes)
    fn = np.zeros(model.n_classes)
    n_pixels = np.zeros(model.n_classes)
    for inp, ann in tqdm(zip(inp_images, annotations)):
        pr = model.predict(inp)
        gt = get_segmentation_array(ann, model.n_classes, no_reshape=True)
        gt = gt.argmax(-1)
        #pr = pr.flatten()
        #gt = gt.flatten()
        for cl_i in range(model.n_classes):
            tp[cl_i] += np.sum((pr == cl_i) * (gt == cl_i))
            fp[cl_i] += np.sum((pr == cl_i) * (gt != cl_i))
            fn[cl_i] += np.sum((pr != cl_i) * (gt == cl_i))
            n_pixels[cl_i] += np.sum(gt == cl_i)
    cl_wise_score = tp / (tp + fp + fn + 0.000000000001)
    n_pixels_norm = n_pixels / np.sum(n_pixels)
    frequency_weighted_IU = np.sum(cl_wise_score*n_pixels_norm)
    mean_IU = np.mean(cl_wise_score)
    return {"frequency_weighted_IU": frequency_weighted_IU,
            "mean_IU": mean_IU,
            "class_wise_IU": cl_wise_score}

================================================
FILE: axelerate/networks/segnet/train.py
================================================
import argparse
import json
from .data_utils.data_loader import create_batch_generator, verify_segmentation_dataset
import os
import glob
import six

def find_latest_checkpoint(checkpoints_path, fail_safe=True):

    def get_epoch_number_from_path(path):
        return path.replace(checkpoints_path, "").strip(".")

    # Get all matching files
    all_checkpoint_files = glob.glob(checkpoints_path + ".*")
    # Keep only entries where the epoch-number suffix is a pure number
    all_checkpoint_files = list(filter(
        lambda f: get_epoch_number_from_path(f).isdigit(), all_checkpoint_files))
    if not len(all_checkpoint_files):
        # The glob list is empty, we don't have a valid checkpoints_path
        if not fail_safe:
            raise ValueError("Checkpoint path {0} invalid".format(checkpoints_path))
        else:
            return None
    # Find the checkpoint file with the maximum epoch
    latest_epoch_checkpoint = max(all_checkpoint_files,
                                  key=lambda f: int(get_epoch_number_from_path(f)))
    return latest_epoch_checkpoint
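# --- Illustrative note (added for clarity; paths below are hypothetical).
# train() below saves weights as "<checkpoints_path>.<epoch>", so with files
#   project/segnet.0  project/segnet.1  ...  project/segnet.12
# find_latest_checkpoint("project/segnet") returns "project/segnet.12",
# and find_latest_checkpoint("missing/path") returns None (fail_safe=True).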
def masked_categorical_crossentropy(gt, pr):
    from keras.losses import categorical_crossentropy
    # Treat class 0 as the "ignore" class: pixels labeled 0 get zero weight
    mask = 1 - gt[:, :, 0]
    return categorical_crossentropy(gt, pr) * mask

def train(model, train_images, train_annotations,
          input_height=None, input_width=None, n_classes=None,
          verify_dataset=True, checkpoints_path=None, epochs=5, batch_size=2,
          validate=False, val_images=None, val_annotations=None, val_batch_size=2,
          auto_resume_checkpoint=False, load_weights=None,
          steps_per_epoch=512, val_steps_per_epoch=512,
          gen_use_multiprocessing=False, ignore_zero_class=False,
          optimizer_name='adadelta', do_augment=False, augmentation_name="aug_all"):
    from .models.all_models import model_from_name
    # check if user gives model name instead of the model object
    if isinstance(model, six.string_types):
        # create the model from the name
        assert (n_classes is not None), "Please provide the n_classes"
        if (input_height is not None) and (input_width is not None):
            model = model_from_name[model](
                n_classes, input_height=input_height, input_width=input_width)
        else:
            model = model_from_name[model](n_classes)

    n_classes = model.n_classes
    input_height = model.input_height
    input_width = model.input_width
    output_height = model.output_height
    output_width = model.output_width

    if validate:
        assert val_images is not None
        assert val_annotations is not None

    if optimizer_name is not None:
        if ignore_zero_class:
            loss_k = masked_categorical_crossentropy
        else:
            loss_k = 'categorical_crossentropy'
        model.compile(loss=loss_k, optimizer=optimizer_name, metrics=['accuracy'])

    if checkpoints_path is not None:
        with open(checkpoints_path + "_config.json", "w") as f:
            json.dump({
                "model_class": model.model_name,
                "n_classes": n_classes,
                "input_height": input_height,
                "input_width": input_width,
                "output_height": output_height,
                "output_width": output_width
            }, f)

    if load_weights is not None and len(load_weights) > 0:
        print("Loading weights from ", load_weights)
        model.load_weights(load_weights)

    if auto_resume_checkpoint and (checkpoints_path is not None):
        latest_checkpoint = find_latest_checkpoint(checkpoints_path)
        if latest_checkpoint is not None:
            print("Loading the weights from latest checkpoint ", latest_checkpoint)
            model.load_weights(latest_checkpoint)

    if verify_dataset:
        print("Verifying training dataset")
        verified = verify_segmentation_dataset(train_images, train_annotations, n_classes)
        assert verified
        if validate:
            print("Verifying validation dataset")
            verified = verify_segmentation_dataset(val_images, val_annotations, n_classes)
            assert verified

    # NOTE: the original called an undefined image_segmentation_generator();
    # create_batch_generator is the generator imported from data_loader above
    train_gen = create_batch_generator(
        train_images, train_annotations, batch_size, n_classes,
        input_height, input_width, output_height, output_width,
        do_augment=do_augment, augmentation_name=augmentation_name)

    if validate:
        val_gen = create_batch_generator(
            val_images, val_annotations, val_batch_size, n_classes,
            input_height, input_width, output_height, output_width)

    if not validate:
        for ep in range(epochs):
            print("Starting Epoch ", ep)
            model.fit_generator(train_gen, steps_per_epoch, epochs=1)
            if checkpoints_path is not None:
                model.save_weights(checkpoints_path + "." + str(ep))
                print("saved ", checkpoints_path + "." + str(ep))
            print("Finished Epoch", ep)
    else:
        for ep in range(epochs):
            print("Starting Epoch ", ep)
            model.fit_generator(train_gen, steps_per_epoch,
                                validation_data=val_gen,
                                validation_steps=val_steps_per_epoch,
                                epochs=1, use_multiprocessing=gen_use_multiprocessing)
            if checkpoints_path is not None:
                model.save_weights(checkpoints_path + "." + str(ep))
                print("saved ", checkpoints_path + "." + str(ep))
            print("Finished Epoch", ep)

================================================
FILE: axelerate/networks/yolo/__init__.py
================================================

================================================
FILE: axelerate/networks/yolo/backend/__init__.py
================================================

================================================
FILE: axelerate/networks/yolo/backend/batch_gen.py
================================================
import cv2
import os
import numpy as np
np.random.seed(1337)
from tensorflow.keras.utils import Sequence
from axelerate.networks.common_utils.augment import ImgAugment
from axelerate.networks.yolo.backend.utils.box import to_centroid, create_anchor_boxes, find_match_box
from axelerate.networks.common_utils.fit import train

def create_batch_generator(annotations, input_size, grid_sizes, batch_size,
                           anchors, repeat_times, augment, norm=None):
    """
    # Args
        annotations : Annotations instance in utils.annotation module
    # Return
        worker : BatchGenerator instance
    """
    img_aug = ImgAugment(input_size[0], input_size[1], augment)
    yolo_box = _YoloBox(input_size, grid_sizes)
    netin_gen = _NetinGen(input_size, norm)
    netout_gen = _NetoutGen(grid_sizes, annotations.n_classes(), anchors)
    worker = BatchGenerator(netin_gen, netout_gen, yolo_box, img_aug,
                            annotations, batch_size, repeat_times)
    return worker

class BatchGenerator(Sequence):
    def __init__(self, netin_gen, netout_gen, yolo_box, img_aug,
                 annotations, batch_size, repeat_times):
        """
        # Args
            annotations : Annotations instance
        """
        self._netin_gen = netin_gen
        self._netout_gen = netout_gen
        self.nb_stages = len(netout_gen.anchors)
        self._img_aug = img_aug
        self._yolo_box = yolo_box
        self._batch_size = min(batch_size, len(annotations)*repeat_times)
        self._repeat_times = repeat_times
        self.annotations = annotations
        self.counter = 0

    def __len__(self):
        return int(len(self.annotations) * self._repeat_times / self._batch_size)

    def __getitem__(self, idx):
        """
        # Args
            idx : batch index
        """
        x_batch = []
        y_batch1 = []
        if self.nb_stages == 2:
            y_batch2 = []
        for i in range(self._batch_size):
            # 1. get input file & its annotation
            fname = self.annotations.fname(self._batch_size*idx + i)
            boxes = self.annotations.boxes(self._batch_size*idx + i)
            labels = self.annotations.code_labels(self._batch_size*idx + i)
            # 2. read image in fixed size
            img, boxes, labels = self._img_aug.imread(fname, boxes, labels)
            # 3. grid scaling centroid boxes
            if len(boxes) > 0:
                norm_boxes = self._yolo_box.trans(boxes)
            else:
                norm_boxes = []
                labels = []
            # 4. generate x_batch
            x_batch.append(self._netin_gen.run(img))
            processed_labels = self._netout_gen.run(norm_boxes, labels)
            y_batch1.append(processed_labels[0])
            if self.nb_stages == 2:
                y_batch2.append(processed_labels[1])
        x_batch = np.array(x_batch)
        y_batch1 = np.array(y_batch1)
        batch = y_batch1
        if self.nb_stages == 2:
            y_batch2 = np.array(y_batch2)
            batch = [y_batch1, y_batch2]
        self.counter += 1
        return x_batch, batch

    def on_epoch_end(self):
        self.annotations.shuffle()
        self.counter = 0

class _YoloBox(object):
    def __init__(self, input_size, grid_size):
        self._input_size = input_size
        self._grid_size = grid_size

    def trans(self, boxes):
        """
        # Args
            boxes : array, shape of (N, 4)
                (x1, y1, x2, y2)-ordered & input image size scale coordinate
        # Returns
            norm_boxes : array, same shape of boxes
                (cx, cy, w, h)-ordered & rescaled to grid-size
        """
        # 1. [[100, 120, 140, 200]] minmax box -> centroid box
        centroid_boxes = to_centroid(boxes).astype(np.float32)
        # 2. [[120. 160. 40. 80.]] image scale -> normalized scale 0~1, e.g. [[4. 5. 1.3333334 2.5]]
        norm_boxes = np.zeros_like(centroid_boxes)
        norm_boxes[:, 0::2] = centroid_boxes[:, 0::2] / self._input_size[1]
        norm_boxes[:, 1::2] = centroid_boxes[:, 1::2] / self._input_size[0]
        #print("norm boxes", norm_boxes)
        return norm_boxes

class _NetinGen(object):
    def __init__(self, input_size, norm):
        self._input_size = input_size
        self._norm = self._set_norm(norm)

    def run(self, image):
        return self._norm(image)

    def _set_norm(self, norm):
        if norm is None:
            return lambda x: x
        else:
            return norm

class _NetoutGen(object):
    def __init__(self, grid_sizes, nb_classes, anchors):
        self.nb_classes = nb_classes
        self.anchors = np.asarray(anchors)
        self._tensor_shape = self._set_tensor_shape(grid_sizes, nb_classes)

    def run(self, norm_boxes, labels):
        """
        # Args
            norm_boxes : array, shape of (N, 4)
                scale normalized boxes
            labels : list of integers
            y_shape : tuple
                (grid_size, grid_size, nb_boxes, 4+1+nb_classes)
        """
        labels = np.asarray([labels])
        norm_boxes = np.asarray(norm_boxes)
        if len(norm_boxes) > 0:
            norm_boxes = np.concatenate((labels.T, norm_boxes), axis=1)
        #print("boxes", boxes)
        y = self.box_to_label(norm_boxes)
        #print(y.shape)
        return y

    def _set_tensor_shape(self, grid_size, nb_classes):
        nb_boxes = len(self.anchors[0])
        return [(grid_size[i][0], grid_size[i][1], nb_boxes, 4+1+nb_classes)
                for i in range(len(self.anchors))]

    def _xy_grid_index(self, box_xy: np.ndarray, layer: int):
        """get xy index in grid scale
        Parameters
        ----------
        box_xy : np.ndarray
            value = [x, y]
        layer : int
            layer index
        Returns
        -------
        [np.ndarray, np.ndarray]
            index xy : = [idx, idy]
        """
        out_wh = self._tensor_shape[layer][0:2:][::-1]
        #print(box_xy, out_wh)
        return np.floor(box_xy * out_wh).astype('int')

    @staticmethod
    def _fake_iou(a: np.ndarray, b: np.ndarray) -> float:
        """set a, b centers to the same point, then calc the iou value
        Parameters
        ----------
        a : np.ndarray
            array value = [w, h]
        b : np.ndarray
            array value = [w, h]
        Returns
        -------
        float
            iou value
        """
        a_maxes = a / 2.
        a_mins = -a_maxes
        b_maxes = b / 2.
        b_mins = -b_maxes
        iner_mins = np.maximum(a_mins, b_mins)
        iner_maxes = np.minimum(a_maxes, b_maxes)
        iner_wh = np.maximum(iner_maxes - iner_mins, 0.)
        iner_area = iner_wh[..., 0] * iner_wh[..., 1]
        s1 = a[..., 0] * a[..., 1]
        s2 = b[..., 0] * b[..., 1]
        return iner_area / (s1 + s2 - iner_area)

    def _get_anchor_index(self, wh: np.ndarray) -> np.ndarray:
        """get the max iou anchor index
        Parameters
        ----------
        wh : np.ndarray
            value = [w, h]
        Returns
        -------
        np.ndarray
            max iou anchor index, value = [layer index, anchor index]
        """
        iou = _NetoutGen._fake_iou(wh, self.anchors)
        return np.unravel_index(np.argmax(iou), iou.shape)

    def box_to_label(self, true_box: np.ndarray) -> tuple:
        """convert the annotation to a yolo v3 label
        Parameters
        ----------
        true_box : np.ndarray
            annotation shape : [n, 5], value : [n*[p, x, y, w, h]]
        Returns
        -------
        tuple
            labels list, value : [output_number*[out_h, out_w, anchor_num, class+5]]
        """
        labels = [np.zeros((self._tensor_shape[i][0], self._tensor_shape[i][1],
                            len(self.anchors[i]), 5 + self.nb_classes), dtype='float32')
                  for i in range(len(self.anchors))]
        for box in true_box:
            # NOTE box [x y w h] are relative to the size of the entire image [0~1]
            l, n = self._get_anchor_index(box[3:5])  # [layer index, anchor index]
            idx, idy = self._xy_grid_index(box[1:3], l)  # [x index, y index]
            labels[l][idy, idx, n, 0:4] = np.clip(box[1:5], 1e-8, 1.)
            labels[l][idy, idx, n, 4] = 1.
            labels[l][idy, idx, n, 5 + int(box[0])] = 1.
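        # --- Note (added for clarity): each entry of `labels` is a tensor of
        # shape [out_h, out_w, n_anchors, 4 + 1 + n_classes]; for every true
        # box only the cell/anchor it is assigned to gets its (x, y, w, h),
        # objectness = 1 and a one-hot class written, everything else stays 0.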
        return labels

================================================
FILE: axelerate/networks/yolo/backend/decoder.py
================================================
import numpy as np
from axelerate.networks.yolo.backend.utils.box import BoundBox, nms_boxes, boxes_to_array

class YoloDecoder(object):
    def __init__(self, anchors, params, nms_threshold, input_size):
        self.anchors = anchors
        self.nms_threshold = nms_threshold
        self.input_size = input_size
        self.params = params

    def run(self, netout, obj_threshold):
        boxes = []
        for l, output in enumerate(netout):
            output = np.squeeze(output)
            grid_h, grid_w, nb_box = output.shape[0:3]
            # decode the output of the network
            output[..., 4] = _sigmoid(output[..., 4])
            output[..., 5:] = output[..., 4][..., np.newaxis] * _sigmoid(output[..., 5:])
            output[..., 5:] *= output[..., 5:] > obj_threshold
            for row in range(grid_h):
                for col in range(grid_w):
                    for b in range(nb_box):
                        # from the 5th element onwards are the class probabilities
                        classes = output[row, col, b, 5:]
                        if np.sum(classes) > 0:
                            # first 4 elements are x, y, w, and h
                            x, y, w, h = output[row, col, b, :4]
                            x = (col + _sigmoid(x)) / grid_w  # center position, unit: image width
                            y = (row + _sigmoid(y)) / grid_h  # center position, unit: image height
                            w = self.anchors[l][b][0] * np.exp(w)  # unit: image width
                            h = self.anchors[l][b][1] * np.exp(h)  # unit: image height
                            confidence = output[row, col, b, 4]
                            box = BoundBox(x, y, w, h, confidence, classes)
                            boxes.append(box)
        boxes = nms_boxes(boxes, len(classes), self.nms_threshold, obj_threshold)
        boxes, probs = boxes_to_array(boxes)
        return boxes, probs

def _sigmoid(x):
    return 1. / (1. + np.exp(-x))

================================================
FILE: axelerate/networks/yolo/backend/loss.py
================================================
import tensorflow as tf
import tensorflow.python.keras.backend as K
from tensorflow import map_fn
import numpy as np
import os
import skimage
import cv2
from math import cos, sin

def tf_xywh_to_all(grid_pred_xy, grid_pred_wh, layer, params):
    """ rescale the pred raw [grid_pred_xy, grid_pred_wh] to [0~1]
    Parameters
    ----------
    grid_pred_xy : tf.Tensor
    grid_pred_wh : tf.Tensor
    layer : int
        the output layer
    params : Params
    Returns
    -------
    tuple
        after process, [all_pred_xy, all_pred_wh]
    """
    with tf.name_scope('xywh_to_all_%d' % layer):
        #print('xyoffset', params.xy_offset[layer], 'outhw', params.out_hw[layer][::-1])
        all_pred_xy = (tf.sigmoid(grid_pred_xy[..., :]) + params.xy_offset[layer]) / params.out_hw[layer][::-1]
        all_pred_wh = tf.exp(grid_pred_wh[..., :]) * params.anchors[layer]
        return all_pred_xy, all_pred_wh

def tf_xywh_to_grid(all_true_xy, all_true_wh, layer, params):
    """convert true label xy wh to grid scale
    Parameters
    ----------
    all_true_xy : tf.Tensor
    all_true_wh : tf.Tensor
    layer : int
        layer index
    params : Params
    Returns
    -------
    [tf.Tensor, tf.Tensor]
        grid_true_xy, grid_true_wh, shape = [out h, out w, anchor num, 2]
    """
    with tf.name_scope('xywh_to_grid_%d' % layer):
        grid_true_xy = (all_true_xy * params.out_hw[layer][::-1]) - params.xy_offset[layer]
        grid_true_wh = tf.math.log(all_true_wh / params.anchors[layer])
        return grid_true_xy, grid_true_wh
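# --- Worked example (illustrative, not part of the original file): with a
# 7x7 output grid and an anchor of (2.0, 3.5), a true box centered at
# (0.5, 0.5) with wh = (0.4, 0.5) encodes via tf_xywh_to_grid() to
#   grid_xy = 0.5 * 7 - 3 = 0.5            (offset inside cell (3, 3))
#   grid_wh = (log(0.4 / 2.0), log(0.5 / 3.5))
# and tf_xywh_to_all() inverts this with sigmoid/exp at prediction time.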
def tf_reshape_box(true_xy_A: tf.Tensor, true_wh_A: tf.Tensor, p_xy_A: tf.Tensor,
                   p_wh_A: tf.Tensor, layer: int, params) -> tuple:
    """ reshape the xywh to [?, h, w, anchor_nums, true_box_nums, 2]
    NOTE must apply the obj mask to the true xywh first!
    Parameters
    ----------
    true_xy_A : tf.Tensor
        shape will be [true_box_nums, 2]
    true_wh_A : tf.Tensor
        shape will be [true_box_nums, 2]
    p_xy_A : tf.Tensor
        shape will be [?, h, w, anchor_nums, 2]
    p_wh_A : tf.Tensor
        shape will be [?, h, w, anchor_nums, 2]
    layer : int
    params : Params
    Returns
    -------
    tuple
        true_cent, true_box_wh, pred_cent, pred_box_wh
    """
    with tf.name_scope('reshape_box_%d' % layer):
        true_cent = true_xy_A[tf.newaxis, tf.newaxis, tf.newaxis, tf.newaxis, ...]
        true_box_wh = true_wh_A[tf.newaxis, tf.newaxis, tf.newaxis, tf.newaxis, ...]
        # NOTE: the original referenced an undefined `helper` object here;
        # the equivalent values live on the `params` argument
        anchor_number = len(params.anchors[layer])
        true_cent = tf.tile(true_cent, [params.batch_size, params.out_hw[layer][0],
                                        params.out_hw[layer][1], anchor_number, 1, 1])
        true_box_wh = tf.tile(true_box_wh, [params.batch_size, params.out_hw[layer][0],
                                            params.out_hw[layer][1], anchor_number, 1, 1])
        pred_cent = p_xy_A[..., tf.newaxis, :]
        pred_box_wh = p_wh_A[..., tf.newaxis, :]
        pred_cent = tf.tile(pred_cent, [1, 1, 1, 1, tf.shape(true_xy_A)[0], 1])
        pred_box_wh = tf.tile(pred_box_wh, [1, 1, 1, 1, tf.shape(true_wh_A)[0], 1])
        return true_cent, true_box_wh, pred_cent, pred_box_wh

def tf_iou(pred_xy: tf.Tensor, pred_wh: tf.Tensor,
           valid_xy: tf.Tensor, valid_wh: tf.Tensor) -> tf.Tensor:
    """ calc the iou of the pred boxes against the valid (ground-truth) boxes
    Parameters
    ----------
    pred_xy : tf.Tensor
        pred box shape = [out h, out w, anchor num, 2]
    pred_wh : tf.Tensor
        pred box shape = [out h, out w, anchor num, 2]
    valid_xy : tf.Tensor
        valid box shape = [?, 2]
    valid_wh : tf.Tensor
        valid box shape = [?, 2]
    Returns
    -------
    tf.Tensor
        iou value, shape = [out h, out w, anchor num, ?]
    """
    b1_xy = tf.expand_dims(pred_xy, -2)
    b1_wh = tf.expand_dims(pred_wh, -2)
    b1_wh_half = b1_wh / 2.
    b1_mins = b1_xy - b1_wh_half
    b1_maxes = b1_xy + b1_wh_half
    b2_xy = tf.expand_dims(valid_xy, 0)
    b2_wh = tf.expand_dims(valid_wh, 0)
    b2_wh_half = b2_wh / 2.
    b2_mins = b2_xy - b2_wh_half
    b2_maxes = b2_xy + b2_wh_half
    intersect_mins = tf.maximum(b1_mins, b2_mins)
    intersect_maxes = tf.minimum(b1_maxes, b2_maxes)
    intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)
    intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]
    b1_area = b1_wh[..., 0] * b1_wh[..., 1]
    b2_area = b2_wh[..., 0] * b2_wh[..., 1]
    iou = intersect_area / (b1_area + b2_area - intersect_area)
    return iou
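# --- Numeric sanity check (illustrative, not part of the original file):
# two unit squares offset by half a side, e.g. centers (0.5, 0.5) and
# (1.0, 0.5) with wh = (1, 1), intersect in a 0.5 x 1.0 strip, so
#   iou = 0.5 / (1 + 1 - 0.5) = 1/3
# which is what tf_iou returns for that pair.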
def calc_ignore_mask(t_xy_A: tf.Tensor, t_wh_A: tf.Tensor, p_xy: tf.Tensor,
                     p_wh: tf.Tensor, obj_mask: tf.Tensor, iou_thresh: float,
                     layer: int, params) -> tf.Tensor:
    """calc the ignore mask
    Parameters
    ----------
    t_xy_A : tf.Tensor
        raw true xy, shape = [batch size, h, w, anchors, 2]
    t_wh_A : tf.Tensor
        raw true wh, shape = [batch size, h, w, anchors, 2]
    p_xy : tf.Tensor
        raw pred xy, shape = [batch size, h, w, anchors, 2]
    p_wh : tf.Tensor
        raw pred wh, shape = [batch size, h, w, anchors, 2]
    obj_mask : tf.Tensor
        old obj mask, shape = [batch size, h, w, anchors]
    iou_thresh : float
        iou thresh
    layer : int
    params : Params
    Returns
    -------
    tf.Tensor
        ignore_mask, shape = [batch size, h, w, anchors, 1]
    """
    with tf.name_scope('calc_mask_%d' % layer):
        pred_xy, pred_wh = tf_xywh_to_all(p_xy, p_wh, layer, params)
        ignore_mask = []
        for bc in range(params.batch_size):
            valid_xy = tf.boolean_mask(t_xy_A[bc], obj_mask[bc])
            valid_wh = tf.boolean_mask(t_wh_A[bc], obj_mask[bc])
            iou_score = tf_iou(pred_xy[bc], pred_wh[bc], valid_xy, valid_wh)
            best_iou = tf.reduce_max(iou_score, axis=-1, keepdims=True)
            ignore_mask.append(tf.cast(best_iou < iou_thresh, tf.float32))
        return tf.stack(ignore_mask)

class Params:
    def __init__(self, obj_thresh, iou_thresh, obj_weight, noobj_weight,
                 wh_weight, out_hw, anchors, class_num):
        self.obj_thresh = obj_thresh
        self.iou_thresh = iou_thresh
        self.wh_weight = wh_weight
        self.obj_weight = obj_weight
        self.noobj_weight = noobj_weight
        self.class_num = class_num
        self.out_hw = np.reshape(np.array(out_hw), (-1, 2))
        #print(self.out_hw)
        self.anchors = anchors
        self.grid_wh = (1 / self.out_hw)[:, [1, 0]]
        #print(self.grid_wh)
        self.wh_scale = Params._anchor_scale(self.anchors, self.grid_wh)
        self.xy_offset = Params._coordinate_offset(self.anchors, self.out_hw)
        self.batch_size = None

    @staticmethod
    def _coordinate_offset(anchors: np.ndarray, out_hw: np.ndarray) -> np.array:
        """construct the anchor coordinate offset array, used in scale conversion
        Parameters
        ----------
        anchors : np.ndarray
            anchors shape = [n,] = [n x [m, 2]]
        out_hw : np.ndarray
            output height width shape = [n, 2]
        Returns
        -------
        np.array
            offsets shape = [n,] = [n x [h_n, w_n, m, 2]]
        """
        grid = []
        for l in range(len(anchors)):
            grid_y = np.tile(np.reshape(np.arange(0, stop=out_hw[l][0]), [-1, 1, 1, 1]),
                             [1, out_hw[l][1], 1, 1])
            grid_x = np.tile(np.reshape(np.arange(0, stop=out_hw[l][1]), [1, -1, 1, 1]),
                             [out_hw[l][0], 1, 1, 1])
            grid.append(np.concatenate([grid_x, grid_y], axis=-1))
        return np.array(grid)

    @staticmethod
    def _anchor_scale(anchors: np.ndarray, grid_wh: np.ndarray) -> np.array:
        """construct the anchor scale array, used to convert labels to annotations
        Parameters
        ----------
        anchors : np.ndarray
            anchors shape = [n,] = [n x [m, 2]]
        grid_wh : np.ndarray
            grid width height shape = [n, 2]
        Returns
        -------
        np.array
            scale shape = [n,] = [n x [m, 2]]
        """
        return np.array([anchors[i] * grid_wh[i] for i in range(len(anchors))])

def create_loss_fn(params, layer, batch_size):
    params.batch_size = batch_size
    shapes = [[-1] + list(params.out_hw[layer]) + [len(params.anchors[layer]), params.class_num + 5]]
    #print(shapes)

    # @tf.function
    def loss_fn(y_true: tf.Tensor, y_pred: tf.Tensor):
        #print(y_true, y_pred)
        """ split the label """
        grid_pred_xy = y_pred[..., 0:2]
        grid_pred_wh = y_pred[..., 2:4]
        pred_confidence = y_pred[..., 4:5]
        pred_cls = y_pred[..., 5:]
all_true_xy = y_true[..., 0:2] all_true_wh = y_true[..., 2:4] true_confidence = y_true[..., 4:5] true_cls = y_true[..., 5:] obj_mask = true_confidence # true_confidence[..., 0] > obj_thresh obj_mask_bool = y_true[..., 4] > params.obj_thresh """ calc the ignore mask """ ignore_mask = calc_ignore_mask(all_true_xy, all_true_wh, grid_pred_xy, grid_pred_wh, obj_mask_bool, params.iou_thresh, layer, params) grid_true_xy, grid_true_wh = tf_xywh_to_grid(all_true_xy, all_true_wh, layer, params) # NOTE When wh=0 , tf.log(0) = -inf, so use K.switch to avoid it grid_true_wh = K.switch(obj_mask_bool, grid_true_wh, tf.zeros_like(grid_true_wh)) """ define loss """ coord_weight = 2 - all_true_wh[..., 0:1] * all_true_wh[..., 1:2] xy_loss = tf.reduce_sum( obj_mask * coord_weight * tf.nn.sigmoid_cross_entropy_with_logits( labels=grid_true_xy, logits=grid_pred_xy)) / params.batch_size wh_loss = tf.reduce_sum( obj_mask * coord_weight * params.wh_weight * tf.square(tf.subtract( x=grid_true_wh, y=grid_pred_wh))) / params.batch_size obj_loss = params.obj_weight * tf.reduce_sum( obj_mask * tf.nn.sigmoid_cross_entropy_with_logits( labels=true_confidence, logits=pred_confidence)) / params.batch_size noobj_loss = params.noobj_weight * tf.reduce_sum( (1 - obj_mask) * ignore_mask * tf.nn.sigmoid_cross_entropy_with_logits( labels=true_confidence, logits=pred_confidence)) / params.batch_size cls_loss = tf.reduce_sum( obj_mask * tf.nn.sigmoid_cross_entropy_with_logits( labels=true_cls, logits=pred_cls)) / params.batch_size total_loss = obj_loss + noobj_loss + cls_loss + xy_loss + wh_loss return total_loss return loss_fn ================================================ FILE: axelerate/networks/yolo/backend/network.py ================================================ # -*- coding: utf-8 -*- import numpy as np import tensorflow as tf from tensorflow.keras.models import Model from tensorflow.keras.layers import Reshape, Conv2D, UpSampling2D, Concatenate, ZeroPadding2D from axelerate.networks.common_utils.feature import create_feature_extractor from axelerate.networks.common_utils.mobilenet_sipeed.mobilenet import _depthwise_conv_block, _conv_block def create_yolo_network(architecture, input_size, nb_classes, nb_box, nb_stages, weights): feature_extractor = create_feature_extractor(architecture, input_size, weights) yolo_net = YoloNetwork(feature_extractor, nb_stages, nb_classes, nb_box) return yolo_net class YoloNetwork(object): def __init__(self, feature_extractor, nb_stages, nb_classes, nb_box): # 1. 
create full network grid_size_y, grid_size_x = feature_extractor.get_output_size(layer = 'conv_pw_13_relu') x1 = feature_extractor.get_output_tensor('conv_pw_13_relu') #x1 = _depthwise_conv_block(inputs = x1, alpha = 1, pointwise_conv_filters = 128, block_id=14) # make the object detection layer y1 = Conv2D(nb_box * (4 + 1 + nb_classes), (1,1), strides=(1,1), padding='same', name='detection_layer_1', kernel_initializer='lecun_normal')(x1) if nb_stages == 2: grid_size_y_2, grid_size_x_2 = feature_extractor.get_output_size(layer = 'conv_pw_11_relu') x2 = feature_extractor.get_output_tensor('conv_pw_11_relu') #x1 = _depthwise_conv_block(inputs = x1, alpha = 1, pointwise_conv_filters = 128, block_id=14) x1 = UpSampling2D(2)(x1) if x1.shape[1:3] != x2.shape[1:3]: #print(x1.shape[1:3] - x2.shape[1:3]) #pad = tf.math.subtract(x1.shape[1:3], x2.shape[1:3]).numpy().tolist() #print(pad) x2 = ZeroPadding2D(padding=((0,1), (0,0)))(x2) grid_size_y_2, grid_size_x_2 = x2.shape[1:3] x2 = Concatenate()([x2, x1]) #x2 = _depthwise_conv_block(inputs = x2, alpha = 1, pointwise_conv_filters = 128, block_id=15) y2 = Conv2D(nb_box * (4 + 1 + nb_classes), (1,1), strides=(1,1), padding='same', name='detection_layer_2', kernel_initializer='lecun_normal')(x2) if nb_stages == 2: l1 = Reshape((grid_size_y, grid_size_x, nb_box, 4 + 1 + nb_classes))(y1) l2 = Reshape((grid_size_y_2, grid_size_x_2, nb_box, 4 + 1 + nb_classes))(y2) detection_layers = ['detection_layer_1', 'detection_layer_2'] output_tensors = [l1, l2] else: l1 = Reshape((grid_size_y, grid_size_x, nb_box, 4 + 1 + nb_classes))(y1) detection_layers = ['detection_layer_1'] output_tensors = [l1] model = Model(feature_extractor.feature_extractor.inputs[0], output_tensors, name='yolo') self._norm = feature_extractor.normalize self._model = model self._init_layers(detection_layers) def _init_layers(self, layers): for layer in layers: layer = self._model.get_layer(layer) weights = layer.get_weights() input_depth = weights[0].shape[-2] # 2048 new_kernel = np.random.normal(size=weights[0].shape)/ input_depth new_bias = np.zeros_like(weights[1]) layer.set_weights([new_kernel, new_bias]) def load_weights(self, weight_path, by_name): self._model.load_weights(weight_path, by_name=by_name) def forward(self, image): netout = self._model.predict(image) return netout def get_model(self, first_trainable_layer=None): return self._model def get_grid_size(self): grid_sizes = [] for model_output in self._model.outputs: grid_sizes.append(list(model_output.shape[1:3])) return grid_sizes def get_normalize_func(self): return self._norm ================================================ FILE: axelerate/networks/yolo/backend/utils/__init__.py ================================================ # All modules in utils package can be run independently and have no dependencies on other modules in the project. # This makes it easy to reuse in other projects. ================================================ FILE: axelerate/networks/yolo/backend/utils/annotation.py ================================================ # -*- coding: utf-8 -*- import os import numpy as np from xml.etree.ElementTree import parse def get_unique_labels(files): parser = PascalVocXmlParser() labels = [] for fname in files: labels += parser.get_labels(fname) labels = list(set(labels)) labels.sort() return labels def get_train_annotations(labels, img_folder, ann_folder, valid_img_folder = "", valid_ann_folder = "", is_only_detect=False): """ # Args labels : list of strings ["raccoon", "human", ...] 
img_folder : str ann_folder : str valid_img_folder : str valid_ann_folder : str # Returns train_anns : Annotations instance valid_anns : Annotations instance """ # parse annotations of the training set train_anns = parse_annotation(ann_folder, img_folder, labels, is_only_detect) # parse annotations of the validation set, if any, otherwise split the training set if os.path.exists(valid_ann_folder): print(valid_ann_folder) valid_anns = parse_annotation(valid_ann_folder, valid_img_folder, labels, is_only_detect) else: train_valid_split = int(0.8*len(train_anns)) train_anns.shuffle() # Todo : Hard coding valid_anns = Annotations(train_anns._label_namings) valid_anns._components = train_anns._components[train_valid_split:] train_anns._components = train_anns._components[:train_valid_split] return train_anns, valid_anns class PascalVocXmlParser(object): """Parse annotation for 1-annotation file """ def __init__(self): pass def get_fname(self, annotation_file): """ # Args annotation_file : str annotation file including directory path # Returns filename : str """ root = self._root_tag(annotation_file) return root.find("filename").text def get_path(self, annotation_file): """ # Args annotation_file : str annotation file including directory path # Returns pathfilename : str """ root = self._root_tag(annotation_file) path = root.find("path") return path if path is None else path.text def get_width(self, annotation_file): """ # Args annotation_file : str annotation file including directory path # Returns width : int """ tree = self._tree(annotation_file) for elem in tree.iter(): if 'width' in elem.tag: return int(elem.text) def get_height(self, annotation_file): """ # Args annotation_file : str annotation file including directory path # Returns height : int """ tree = self._tree(annotation_file) for elem in tree.iter(): if 'height' in elem.tag: return int(elem.text) def get_labels(self, annotation_file): """ # Args annotation_file : str annotation file including directory path # Returns labels : list of strs """ root = self._root_tag(annotation_file) labels = [] obj_tags = root.findall("object") for t in obj_tags: labels.append(t.find("name").text) return labels def get_boxes(self, annotation_file): """ # Args annotation_file : str annotation file including directory path # Returns bbs : 2d-array, shape of (N, 4) (x1, y1, x2, y2)-ordered """ root = self._root_tag(annotation_file) bbs = [] obj_tags = root.findall("object") for t in obj_tags: box_tag = t.find("bndbox") x1 = box_tag.find("xmin").text y1 = box_tag.find("ymin").text x2 = box_tag.find("xmax").text y2 = box_tag.find("ymax").text box = np.array([int(float(x1)), int(float(y1)), int(float(x2)), int(float(y2))]) bbs.append(box) bbs = np.array(bbs) return bbs def _root_tag(self, fname): tree = parse(fname) root = tree.getroot() return root def _tree(self, fname): tree = parse(fname) return tree def parse_annotation(ann_dir, img_dir, labels_naming=[], is_only_detect=False): """ # Args ann_dir : str img_dir : str labels_naming : list of strings # Returns all_imgs : list of dict """ parser = PascalVocXmlParser() if is_only_detect: annotations = Annotations(["object"]) else: annotations = Annotations(labels_naming) for ann in sorted(os.listdir(ann_dir)): annotation_file = os.path.join(ann_dir, ann) fname = parser.get_fname(annotation_file) path = parser.get_path(annotation_file) if not path or not os.path.exists(path): path = os.path.join(img_dir, fname) annotation = Annotation(path) labels = parser.get_labels(annotation_file) boxes = 
parser.get_boxes(annotation_file) for label, box in zip(labels, boxes): x1, y1, x2, y2 = box if is_only_detect: annotation.add_object(x1, y1, x2, y2, name="object") else: if label in labels_naming: annotation.add_object(x1, y1, x2, y2, name=label) if annotation.boxes is not None: annotations.add(annotation) return annotations class Annotation(object): """ # Attributes fname : image file path labels : list of strings boxes : Boxes instance """ def __init__(self, filename): self.fname = filename self.labels = [] self.boxes = None def add_object(self, x1, y1, x2, y2, name): self.labels.append(name) if self.boxes is None: self.boxes = np.array([x1, y1, x2, y2]).reshape(-1,4) else: box = np.array([x1, y1, x2, y2]).reshape(-1,4) self.boxes = np.concatenate([self.boxes, box]) class Annotations(object): def __init__(self, label_namings): self._components = [] self._label_namings = label_namings def n_classes(self): return len(self._label_namings) def add(self, annotation): self._components.append(annotation) def shuffle(self): np.random.shuffle(self._components) def fname(self, i): index = self._valid_index(i) return self._components[index].fname def boxes(self, i): index = self._valid_index(i) return self._components[index].boxes def labels(self, i): """ # Returns labels : list of strings """ index = self._valid_index(i) return self._components[index].labels def code_labels(self, i): """ # Returns code_labels : list of int """ str_labels = self.labels(i) labels = [] for label in str_labels: labels.append(self._label_namings.index(label)) return labels def _valid_index(self, i): valid_index = i % len(self._components) return valid_index def __len__(self): return len(self._components) def __getitem__(self, idx): return self._components[idx] ================================================ FILE: axelerate/networks/yolo/backend/utils/box.py ================================================ import numpy as np import cv2 class BoundBox: def __init__(self, x, y, w, h, c = None, classes = None): self.x = x self.y = y self.w = w self.h = h self.c = c self.classes = classes def get_label(self): return np.argmax(self.classes) def get_score(self): return self.classes[self.get_label()] def iou(self, bound_box): b1 = self.as_centroid() b2 = bound_box.as_centroid() return centroid_box_iou(b1, b2) def as_centroid(self): return np.array([self.x, self.y, self.w, self.h]) def boxes_to_array(bound_boxes): """ # Args boxes : list of BoundBox instances # Returns centroid_boxes : (N, 4) probs : (N, nb_classes) """ centroid_boxes = [] probs = [] for box in bound_boxes: centroid_boxes.append([box.x, box.y, box.w, box.h]) probs.append(box.classes) return np.array(centroid_boxes), np.array(probs) def nms_boxes(boxes, n_classes, nms_threshold=0.3, obj_threshold=0.3): """ # Args boxes : list of BoundBox # Returns boxes : list of BoundBox non maximum supressed BoundBox instances """ # suppress non-maximal boxes for c in range(n_classes): sorted_indices = list(reversed(np.argsort([box.classes[c] for box in boxes]))) for i in range(len(sorted_indices)): index_i = sorted_indices[i] if boxes[index_i].classes[c] == 0: continue else: for j in range(i+1, len(sorted_indices)): index_j = sorted_indices[j] if boxes[index_i].iou(boxes[index_j]) >= nms_threshold: boxes[index_j].classes[c] = 0 # remove the boxes which are less likely than a obj_threshold boxes = [box for box in boxes if box.get_score() > obj_threshold] return boxes def draw_scaled_boxes(image, boxes, probs, labels, desired_size=400): img_size = min(image.shape[:2]) if 
img_size < desired_size:
        scale_factor = float(desired_size) / img_size
    else:
        scale_factor = 1.0
    h, w = image.shape[:2]
    img_scaled = cv2.resize(image, (int(w*scale_factor), int(h*scale_factor)))
    if boxes != []:
        boxes_scaled = boxes*scale_factor
        # np.int was removed in NumPy 1.24; use the builtin int dtype instead
        boxes_scaled = boxes_scaled.astype(int)
    else:
        boxes_scaled = boxes
    # NOTE: the original passed only 4 arguments to the 5-parameter
    # draw_boxes(); deriving the class indices from probs is assumed here
    classes = probs.argmax(axis=-1) if len(probs) else []
    return draw_boxes(img_scaled, boxes_scaled, probs, classes, labels)

def draw_boxes(image, boxes, scores, classes, labels):
    color = (0, 125, 0)
    for i in range(len(boxes)):
        x_min, y_min, x_max, y_max = boxes[i]
        obj_class = classes[i]
        score = scores[i]
        # Draw bounding box around detected object
        cv2.rectangle(image, (x_min, y_min), (x_max, y_max), color, 2)
        #print(labels[obj_class], score)
        # Create label for detected object class
        label = "{}:{:.2f}%".format(labels[obj_class], np.max(score))
        label_color = (255, 255, 255)
        text_size = 0.0015 * min(image.shape[0], image.shape[1])
        # Make sure label always stays on-screen
        x_text, y_text = cv2.getTextSize(label, cv2.FONT_HERSHEY_DUPLEX, text_size, 1)[0][:2]
        lbl_box_xy_min = (x_min, y_min if y_min < 25 else y_min - y_text)
        lbl_box_xy_max = (x_min + x_text, y_min + y_text if y_min < 25 else y_min)
        lbl_text_pos = (x_min, y_min)
        # Add label and confidence value
        cv2.rectangle(image, lbl_box_xy_min, lbl_box_xy_max, color, -1)
        cv2.putText(image, label, lbl_text_pos, cv2.FONT_HERSHEY_DUPLEX,
                    text_size, label_color, 1, cv2.LINE_AA)
    return image

def centroid_box_iou(box1, box2):
    def _interval_overlap(interval_a, interval_b):
        x1, x2 = interval_a
        x3, x4 = interval_b
        if x3 < x1:
            if x4 < x1:
                return 0
            else:
                return min(x2, x4) - x1
        else:
            if x2 < x3:
                return 0
            else:
                return min(x2, x4) - x3

    _, _, w1, h1 = box1.reshape(-1,)
    _, _, w2, h2 = box2.reshape(-1,)
    x1_min, y1_min, x1_max, y1_max = to_minmax(box1.reshape(-1, 4)).reshape(-1,)
    x2_min, y2_min, x2_max, y2_max = to_minmax(box2.reshape(-1, 4)).reshape(-1,)
    intersect_w = _interval_overlap([x1_min, x1_max], [x2_min, x2_max])
    intersect_h = _interval_overlap([y1_min, y1_max], [y2_min, y2_max])
    intersect = intersect_w * intersect_h
    union = w1 * h1 + w2 * h2 - intersect
    return float(intersect) / union

def to_centroid(minmax_boxes):
    """
    minmax_boxes   : (N, 4), e.g. [[100, 120, 140, 200]]
    centroid_boxes :          e.g. [[120., 160., 40., 80.]]
    """
    #minmax_boxes = np.asarray([[100, 120, 140, 200]])
    # np.float was removed in NumPy 1.24; use the builtin float dtype instead
    minmax_boxes = minmax_boxes.astype(float)
    centroid_boxes = np.zeros_like(minmax_boxes)
    x1 = minmax_boxes[:, 0]
    y1 = minmax_boxes[:, 1]
    x2 = minmax_boxes[:, 2]
    y2 = minmax_boxes[:, 3]
    centroid_boxes[:, 0] = (x1 + x2) / 2
    centroid_boxes[:, 1] = (y1 + y2) / 2
    centroid_boxes[:, 2] = x2 - x1
    centroid_boxes[:, 3] = y2 - y1
    return centroid_boxes

def to_minmax(centroid_boxes):
    centroid_boxes = centroid_boxes.astype(float)
    minmax_boxes = np.zeros_like(centroid_boxes)
    cx = centroid_boxes[:, 0]
    cy = centroid_boxes[:, 1]
    w = centroid_boxes[:, 2]
    h = centroid_boxes[:, 3]
    minmax_boxes[:, 0] = cx - w/2
    minmax_boxes[:, 1] = cy - h/2
    minmax_boxes[:, 2] = cx + w/2
    minmax_boxes[:, 3] = cy + h/2
    return minmax_boxes

def create_anchor_boxes(anchors):
    """
    # Args
        anchors : list of floats
    # Returns
        boxes : array, shape of (len(anchors)/2, 4)
            centroid-type
    """
    boxes = []
    n_boxes = int(len(anchors)/2)
    for i in range(n_boxes):
        boxes.append(np.array([0, 0, anchors[2*i], anchors[2*i+1]]))
    return np.array(boxes)

def find_match_box(centroid_box, centroid_boxes):
    """Find the index of the boxes with the largest overlap among the N-boxes.
# Args box : array, shape of (1, 4) boxes : array, shape of (N, 4) # Return match_index : int """ match_index = -1 max_iou = -1 for i, box in enumerate(centroid_boxes): iou = centroid_box_iou(centroid_box, box) if max_iou < iou: match_index = i max_iou = iou return match_index ================================================ FILE: axelerate/networks/yolo/backend/utils/custom.py ================================================ from tensorflow.python import keras from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops from tensorflow.python.keras.utils.generic_utils import to_list from tensorflow.python.keras.utils import metrics_utils from tensorflow.python.keras.metrics import Metric from tensorflow.python.keras import backend as K from tensorflow.python.ops import state_ops from tensorflow.python.ops.resource_variable_ops import ResourceVariable import numpy as np import os import tensorflow as tf import tensorflow.keras class Yolo_Precision(Metric): def __init__(self, thresholds=None, name=None, dtype=None): super(Yolo_Precision, self).__init__(name=name, dtype=dtype) self.init_thresholds = thresholds default_threshold = 0.5 self.thresholds = default_threshold if thresholds is None else thresholds self.true_positives = self.add_weight( 'tp', initializer=init_ops.zeros_initializer) # type: ResourceVariable self.false_positives = self.add_weight( 'fp', initializer=init_ops.zeros_initializer) # type: ResourceVariable def update_state(self, y_true, y_pred, sample_weight=None): true_confidence = y_true[..., 4:5] pred_confidence = y_pred[..., 4:5] pred_confidence_sigmoid = math_ops.sigmoid(pred_confidence) values = math_ops.logical_and(true_confidence > self.thresholds, pred_confidence > self.thresholds) values = math_ops.cast(values, self.dtype) self.true_positives.assign_add(math_ops.reduce_sum(values)) values = math_ops.logical_and(math_ops.logical_not(true_confidence > self.thresholds), pred_confidence > self.thresholds) values = math_ops.cast(values, self.dtype) self.false_positives.assign_add(math_ops.reduce_sum(values)) def result(self): return math_ops.div_no_nan(self.true_positives, (math_ops.add(self.true_positives, self.false_positives))) class Yolo_Recall(Metric): def __init__(self, thresholds=None, name=None, dtype=None): super(Yolo_Recall, self).__init__(name=name, dtype=dtype) self.init_thresholds = thresholds default_threshold = 0.5 self.thresholds = default_threshold if thresholds is None else thresholds self.true_positives = self.add_weight( 'tp', initializer=init_ops.zeros_initializer) self.false_negatives = self.add_weight( 'fn', initializer=init_ops.zeros_initializer) def update_state(self, y_true, y_pred, sample_weight=None): true_confidence = y_true[..., 4:5] pred_confidence = y_pred[..., 4:5] pred_confidence_sigmoid = math_ops.sigmoid(pred_confidence) values = math_ops.logical_and(true_confidence > self.thresholds, pred_confidence > self.thresholds) values = math_ops.cast(values, self.dtype) self.true_positives.assign_add(math_ops.reduce_sum(values)) # type: ResourceVariable values = math_ops.logical_and(true_confidence > self.thresholds, math_ops.logical_not(pred_confidence > self.thresholds)) values = math_ops.cast(values, self.dtype) self.false_negatives.assign_add(math_ops.reduce_sum(values)) # type: ResourceVariable def result(self): return math_ops.div_no_nan(self.true_positives, (math_ops.add(self.true_positives, self.false_negatives))) class MergeMetrics(tensorflow.keras.callbacks.Callback): def __init__(self, model, type, period = 
1, save_best=False, save_name=None, tensorboard=None): super().__init__() self.type = type self.name = "total_val_" + self.type output_names = [] for layer in model.layers: if 'reshape' in layer.name: output_names.append(layer.name) self.output_names = ['val_' + output_name + "_" + self.type if len(output_names) > 1 else 'val_' + self.type for output_name in output_names] print("Layers to use in {} callback monitoring: {}".format(self.name, self.output_names)) self.num_outputs = len(self.output_names) self._period = period self._save_best = save_best self._save_name = save_name self._tensorboard = tensorboard self.best_result = 0 if not isinstance(self._tensorboard, tensorflow.keras.callbacks.TensorBoard) and self._tensorboard is not None: raise ValueError("Tensorboard object must be a instance from keras.callbacks.TensorBoard") def on_epoch_end(self, epoch, logs={}): logs = logs or {} if epoch % self._period == 0 and self._period != 0: result = sum([logs[output_name] for output_name in self.output_names])/self.num_outputs logs[self.name] = result print('\n') print('{}: {:.4f}'.format(self.name, result)) if epoch == 0: print("Saving model on first epoch irrespective of {}".format(self.name)) self.model.save(self._save_name, overwrite=True, include_optimizer=False) else: if self._save_best and self._save_name is not None and result > self.best_result: print("{} improved from {} to {}, saving model to {}.".format(self.name, self.best_result, result, self._save_name)) self.best_result = result self.model.save(self._save_name, overwrite=True, include_optimizer=False) else: print("{} did not improve from {}.".format(self.name, self.best_result)) if self._tensorboard: writer = tf.summary.create_file_writer(self._tensorboard.log_dir) with writer.as_default(): tf.summary.scalar(self.name, result, step=epoch) writer.flush() ================================================ FILE: axelerate/networks/yolo/backend/utils/eval/__init__.py ================================================ ================================================ FILE: axelerate/networks/yolo/backend/utils/eval/_box_match.py ================================================ # -*- coding: utf-8 -*- import numpy as np from scipy.optimize import linear_sum_assignment as linear_assignment class BoxMatcher(object): """ # Args boxes1 : ndarray, shape of (N, 4) (x1, y1, x2, y2) ordered boxes2 : ndarray, shape of (M, 4) (x1, y1, x2, y2) ordered """ def __init__(self, boxes1, boxes2, labels1=None, labels2=None): self._boxes1 = boxes1 self._boxes2 = boxes2 if len(boxes1) == 0 or len(boxes2) == 0: pass else: if labels1 is None or labels2 is None: self._iou_matrix = self._calc(boxes1, boxes2, np.ones((len(boxes1),)), np.ones((len(boxes2),))) else: self._iou_matrix = self._calc(boxes1, boxes2, labels1, labels2) self._match_pairs = np.asarray(linear_assignment(-1*self._iou_matrix)) self._match_pairs = np.transpose(self._match_pairs) def match_idx_of_box1_idx(self, box1_idx): """ # Args box1_idx : int # Returns box2_idx : int or None if matching index does not exist, return None iou : float IOU (intersection over union) between the box corresponding to the box1 index and the box2 matching it """ assert box1_idx < len(self._boxes1) if len(self._boxes2) == 0: return None, 0 box1_matching_idx_list = self._match_pairs[:, 0] box2_matching_idx_list = self._match_pairs[:, 1] box2_idx = self._find(box1_idx, box1_matching_idx_list, box2_matching_idx_list) if box2_idx is None: iou = 0 else: iou = self._iou_matrix[box1_idx, box2_idx] return box2_idx, iou def 
match_idx_of_box2_idx(self, box2_idx):
        """
        # Args
            box2_idx : int
        # Returns
            box1_idx : int or None
                if matching index does not exist, return None
            iou : float
                IOU (intersection over union) between the box corresponding
                to the box2 index and the box1 matching it
        """
        assert box2_idx < len(self._boxes2)
        if len(self._boxes1) == 0:
            return None, 0
        box1_matching_idx_list = self._match_pairs[:, 0]
        box2_matching_idx_list = self._match_pairs[:, 1]
        box1_idx = self._find(box2_idx, box2_matching_idx_list, box1_matching_idx_list)
        if box1_idx is None:
            iou = 0
        else:
            iou = self._iou_matrix[box1_idx, box2_idx]
        return box1_idx, iou

    def _find(self, input_idx, input_idx_list, output_idx_list):
        if input_idx in input_idx_list:
            loc = np.where(input_idx_list == input_idx)[0][0]
            output_idx = int(output_idx_list[loc])
        else:
            output_idx = None
        return output_idx

    def _calc_maximum_ious(self):
        # NOTE: unused in the original (and misspelled "maximun"); _calc
        # requires label arguments, so dummy all-ones labels are passed here
        ious_for_each_gt = self._calc(self._boxes1, self._boxes2,
                                      np.ones((len(self._boxes1),)),
                                      np.ones((len(self._boxes2),)))
        ious = np.max(ious_for_each_gt, axis=0)
        return ious

    def _calc(self, boxes, true_boxes, labels, true_labels):
        ious_for_each_gt = []
        for truth_box, truth_label in zip(true_boxes, true_labels):
            x1 = boxes[:, 0]
            y1 = boxes[:, 1]
            x2 = boxes[:, 2]
            y2 = boxes[:, 3]
            x1_gt = truth_box[0]
            y1_gt = truth_box[1]
            x2_gt = truth_box[2]
            y2_gt = truth_box[3]
            xx1 = np.maximum(x1, x1_gt)
            yy1 = np.maximum(y1, y1_gt)
            xx2 = np.minimum(x2, x2_gt)
            yy2 = np.minimum(y2, y2_gt)
            w = np.maximum(0, xx2 - xx1 + 1)
            h = np.maximum(0, yy2 - yy1 + 1)
            intersections = w*h
            As = (x2 - x1 + 1) * (y2 - y1 + 1)
            B = (x2_gt - x1_gt + 1) * (y2_gt - y1_gt + 1)
            # np.float was removed in NumPy 1.24; use the builtin float instead
            label_score = (labels == truth_label).astype(float)
            ious = label_score * intersections.astype(float) / (As + B - intersections)
            ious_for_each_gt.append(ious)
        # (n_truth, n_boxes)
        ious_for_each_gt = np.array(ious_for_each_gt)
        return ious_for_each_gt.T

if __name__ == "__main__":
    labels = np.array([1, 2, 3, 4])
    label = np.array([4])
    expected = np.array([0, 0, 0, 1])
    label_score = (labels == label).astype(float)
    print(label_score)
    labels = np.array(["a", "bb", "a", "cc"])
    label = np.array(["cc"])
    label_score = (labels == label).astype(float)
    print(label_score)

================================================
FILE: axelerate/networks/yolo/backend/utils/eval/fscore.py
================================================
# -*- coding: utf-8 -*-
from ._box_match import BoxMatcher

def count_true_positives(detect_boxes, true_boxes, detect_labels=None, true_labels=None):
    """
    # Args
        detect_boxes : array, shape of (n_detected_boxes, 4)
        true_boxes : array, shape of (n_true_boxes, 4)
        detect_labels : array, shape of (n_detected_boxes,)
        true_labels : array, shape of (n_true_boxes,)
    """
    n_true_positives = 0
    matcher = BoxMatcher(detect_boxes, true_boxes, detect_labels, true_labels)
    for i in range(len(detect_boxes)):
        matching_idx, iou = matcher.match_idx_of_box1_idx(i)
        print("detect_idx: {}, true_idx: {}, matching-score: {}".format(i, matching_idx, iou))
        if matching_idx is not None and iou > 0.5:
            n_true_positives += 1
    return n_true_positives

def calc_score(n_true_positives, n_truth, n_pred):
    """
    # Args
        n_true_positives : int
        n_truth : int, number of ground-truth boxes
        n_pred : int, number of detected boxes
    """
    if n_pred > 0:
        precision = n_true_positives / n_pred
    else:
        precision = 0
    if n_truth > 0:
        recall = n_true_positives / n_truth
    elif n_truth == 0 and n_true_positives == 0:
        recall = 1
    else:
        recall = 0
    if precision + recall > 0:
        fscore = 2 * precision * recall / (precision + recall)
        score = {"fscore": fscore, "precision": precision, "recall": recall}
    else:
        # keep the return type consistent: the original returned a bare 0 here
        score = {"fscore": 0, "precision": precision, "recall": recall}
    return score

if __name__ == '__main__':
    pass
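# --- Worked example (illustrative, not part of the original file): with
# 8 true positives out of 10 ground-truth boxes and 12 detections,
#   precision = 8/12 = 0.667, recall = 8/10 = 0.8
#   fscore = 2 * 0.667 * 0.8 / (0.667 + 0.8) = 0.727
# i.e. calc_score(8, 10, 12) returns those three values in a dict.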
================================================ FILE: axelerate/networks/yolo/frontend.py ================================================ # -*- coding: utf-8 -*- # This module is responsible for communicating with the outside of the yolo package. # Outside the package, someone can use yolo detector accessing with this module. import os import time import numpy as np import tensorflow as tf from tqdm import tqdm from axelerate.networks.common_utils.fit import train from axelerate.networks.yolo.backend.decoder import YoloDecoder from axelerate.networks.yolo.backend.utils.custom import Yolo_Precision, Yolo_Recall from axelerate.networks.yolo.backend.loss import create_loss_fn, Params from axelerate.networks.yolo.backend.network import create_yolo_network from axelerate.networks.yolo.backend.batch_gen import create_batch_generator from axelerate.networks.yolo.backend.utils.annotation import get_train_annotations, get_unique_labels from axelerate.networks.yolo.backend.utils.box import to_minmax def get_object_labels(ann_directory): files = os.listdir(ann_directory) files = [os.path.join(ann_directory, fname) for fname in files] return get_unique_labels(files) def create_yolo(architecture, labels, input_size, anchors, obj_thresh, iou_thresh, coord_scale, object_scale, no_object_scale, weights = None): n_classes = len(labels) n_boxes = int(len(anchors[0])) n_branches = len(anchors) yolo_network = create_yolo_network(architecture, input_size, n_classes, n_boxes, n_branches, weights) yolo_params = Params(obj_thresh, iou_thresh, object_scale, no_object_scale, coord_scale, yolo_network.get_grid_size(), anchors, n_classes) yolo_loss = create_loss_fn metrics_dict = {'recall': [Yolo_Precision(obj_thresh, name='precision'), Yolo_Recall(obj_thresh, name='recall')], 'precision': [Yolo_Precision(obj_thresh, name='precision'), Yolo_Recall(obj_thresh, name='recall')]} yolo_decoder = YoloDecoder(anchors, yolo_params, 0.1, input_size) yolo = YOLO(yolo_network, yolo_loss, yolo_decoder, labels, input_size, yolo_params, metrics_dict) return yolo class YOLO(object): def __init__(self, yolo_network, yolo_loss, yolo_decoder, labels, input_size, yolo_params, metrics_dict): self.yolo_network = yolo_network self.yolo_loss = yolo_loss self.yolo_decoder = yolo_decoder self.labels = labels self.input_size = input_size self.norm = yolo_network._norm self.yolo_params = yolo_params self.num_branches = len(self.yolo_params.anchors) self.metrics_dict = metrics_dict def load_weights(self, weight_path, by_name=True): if os.path.exists(weight_path): print("Loading pre-trained weights for the whole model: ", weight_path) self.yolo_network.load_weights(weight_path, by_name=True) else: print("Failed to load pre-trained weights for the whole model. 
It might be because you didn't specify any or the weight file cannot be found") def predict(self, image, height, width, threshold=0.3): """ # Args image : 3d-array (RGB ordered) # Returns boxes : array, shape of (N, 4) probs : array, shape of (N, nb_classes) """ def _to_original_scale(boxes): minmax_boxes = to_minmax(boxes) minmax_boxes[:,0] *= width minmax_boxes[:,2] *= width minmax_boxes[:,1] *= height minmax_boxes[:,3] *= height return minmax_boxes.astype(np.int) start_time = time.time() netout = self.yolo_network.forward(image) elapsed_ms = (time.time() - start_time) * 1000 boxes, probs= self.yolo_decoder.run(netout, threshold) if len(boxes) > 0: boxes = _to_original_scale(boxes) print(boxes, probs) return elapsed_ms, boxes, probs else: return elapsed_ms, [], [] def evaluate(self, img_folder, ann_folder, batch_size): self.generator = create_batch_generator(img_folder, ann_folder, self.input_size, self.output_size, self.n_classes, batch_size, 1, False, self.norm) tp = np.zeros(self.n_classes) fp = np.zeros(self.n_classes) fn = np.zeros(self.n_classes) n_pixels = np.zeros(self.n_classes) for inp, gt in tqdm(list(self.generator)): y_pred = self.network.predict(inp) def train(self, img_folder, ann_folder, nb_epoch, project_folder, batch_size, jitter, learning_rate, train_times, valid_times, valid_img_folder, valid_ann_folder, first_trainable_layer, metrics): # 1. get annotations train_annotations, valid_annotations = get_train_annotations(self.labels, img_folder, ann_folder, valid_img_folder, valid_ann_folder, is_only_detect = False) # 1. get batch generator valid_batch_size = len(valid_annotations)*valid_times if valid_batch_size < batch_size: raise ValueError("Not enough validation images: batch size {} is larger than {} validation images. Add more validation images or decrease batch size!".format(batch_size, valid_batch_size)) train_batch_generator = self._get_batch_generator(train_annotations, batch_size, train_times, augment=jitter) valid_batch_generator = self._get_batch_generator(valid_annotations, batch_size, valid_times, augment=False) # 2. To train model get keras model instance & loss function model = self.yolo_network.get_model(first_trainable_layer) loss = self._get_loss_func(batch_size) # 3. 
Run training loop return train(model, loss, train_batch_generator, valid_batch_generator, learning_rate = learning_rate, nb_epoch = nb_epoch, project_folder = project_folder, first_trainable_layer = first_trainable_layer, metric=self.metrics_dict, metric_name=metrics) def _get_loss_func(self, batch_size): return [self.yolo_loss(self.yolo_params, layer, batch_size) for layer in range(self.num_branches)] def _get_batch_generator(self, annotations, batch_size, repeat_times, augment): """ # Args annotations : Annotations instance batch_size : int jitter : bool # Returns batch_generator : BatchGenerator instance """ batch_generator = create_batch_generator(annotations, self.input_size, self.yolo_network.get_grid_size(), batch_size, self.yolo_params.anchors, repeat_times, augment=augment, norm=self.yolo_network.get_normalize_func()) return batch_generator ================================================ FILE: axelerate/train.py ================================================ import shutil import numpy as np np.random.seed(111) import argparse import os import time import sys import json import matplotlib from axelerate.networks.yolo.frontend import create_yolo, get_object_labels from axelerate.networks.classifier.frontend_classifier import create_classifier, get_labels from axelerate.networks.segnet.frontend_segnet import create_segnet from axelerate.networks.common_utils.convert import Converter os.environ['TF_CPP_MIN_LOG_LEVEL'] = '4' import tensorflow as tf tf.get_logger().setLevel('ERROR') argparser = argparse.ArgumentParser( description='Train and validate YOLO_v2 model on any dataset') argparser.add_argument( '-c', '--config', default="configs/from_scratch.json", help='path to configuration file') def train_from_config(config,project_folder): try: matplotlib.use('Agg') except: pass #added for compatibility with < 0.5.7 versions try: input_size = config['model']['input_size'][:] except: input_size = [config['model']['input_size'],config['model']['input_size']] # Create the converter converter = Converter(config['converter']['type'], config['model']['architecture'], config['train']['valid_image_folder']) # Segmentation network if config['model']['type']=='SegNet': print('Segmentation') # 1. Construct the model segnet = create_segnet(config['model']['architecture'], input_size, config['model']['n_classes'], config['weights']['backend']) # 2. Load the pretrained weights (if any) segnet.load_weights(config['weights']['full'], by_name=True) # 3. actual training model_layers, model_path = segnet.train(config['train']['train_image_folder'], config['train']['train_annot_folder'], config['train']['actual_epoch'], project_folder, config["train"]["batch_size"], config["train"]["augmentation"], config['train']['learning_rate'], config['train']['train_times'], config['train']['valid_times'], config['train']['valid_image_folder'], config['train']['valid_annot_folder'], config['train']['first_trainable_layer'], config['train']['ignore_zero_class'], config['train']['valid_metric']) # Classifier if config['model']['type']=='Classifier': print('Classifier') if config['model']['labels']: labels = config['model']['labels'] else: labels = get_labels(config['train']['train_image_folder']) # 1. Construct the model classifier = create_classifier(config['model']['architecture'], labels, input_size, config['model']['fully-connected'], config['model']['dropout'], config['weights']['backend'], config['weights']['save_bottleneck']) # 2. 
Load the pretrained weights (if any) classifier.load_weights(config['weights']['full'], by_name=True) # 3. actual training model_layers, model_path = classifier.train(config['train']['train_image_folder'], config['train']['actual_epoch'], project_folder, config["train"]["batch_size"], config["train"]["augmentation"], config['train']['learning_rate'], config['train']['train_times'], config['train']['valid_times'], config['train']['valid_image_folder'], config['train']['first_trainable_layer'], config['train']['valid_metric']) # Detector if config['model']['type']=='Detector': if config['train']['is_only_detect']: labels = ["object"] else: if config['model']['labels']: labels = config['model']['labels'] else: labels = get_object_labels(config['train']['train_annot_folder']) print(labels) # 1. Construct the model yolo = create_yolo(config['model']['architecture'], labels, input_size, config['model']['anchors'], config['model']['obj_thresh'], config['model']['iou_thresh'], config['model']['coord_scale'], config['model']['object_scale'], config['model']['no_object_scale'], config['weights']['backend']) # 2. Load the pretrained weights (if any) yolo.load_weights(config['weights']['full'], by_name=True) # 3. actual training model_layers, model_path = yolo.train(config['train']['train_image_folder'], config['train']['train_annot_folder'], config['train']['actual_epoch'], project_folder, config["train"]["batch_size"], config["train"]["augmentation"], config['train']['learning_rate'], config['train']['train_times'], config['train']['valid_times'], config['train']['valid_image_folder'], config['train']['valid_annot_folder'], config['train']['first_trainable_layer'], config['train']['valid_metric']) # 4 Convert the model time.sleep(2) converter.convert_model(model_path) return model_path def setup_training(config_file=None, config_dict=None): """make directory to save weights & its configuration """ if config_file: with open(config_file) as config_buffer: config = json.loads(config_buffer.read()) elif config_dict: config = config_dict else: print('No config found') sys.exit() dirname = os.path.join("projects", config['train']['saved_folder']) if os.path.isdir(dirname): print("Project folder {} already exists. 
Creating a folder for new training session.".format(dirname)) else: print("Project folder {} is created.".format(dirname, dirname)) os.makedirs(dirname) return(train_from_config(config, dirname)) if __name__ == '__main__': argparser = argparse.ArgumentParser( description='Train and validate YOLO_v2 model on any dataset') argparser.add_argument( '-c', '--config', default="configs/classifer.json", help='path to configuration file') args = argparser.parse_args() setup_training(config_file=args.config) shutil.rmtree("logs", ignore_errors=True) ================================================ FILE: configs/classifier.json ================================================ { "model" : { "type": "Classifier", "architecture": "MobileNet7_5", "input_size": 224, "fully-connected": [100,50], "labels": [], "dropout" : 0.5 }, "weights" : { "full": "", "backend": "imagenet", "save_bottleneck": false }, "train" : { "actual_epoch": 1, "train_image_folder": "sample_datasets/classifier/imgs", "train_times": 4, "valid_image_folder": "sample_datasets/classifier/imgs_validation", "valid_times": 4, "valid_metric": "val_accuracy", "batch_size": 4, "learning_rate": 1e-4, "saved_folder": "classifier", "first_trainable_layer": "", "augmentation": true }, "converter" : { "type": ["k210","tflite"] } } ================================================ FILE: configs/detector.json ================================================ { "model" : { "type": "Detector", "architecture": "MobileNet7_5", "input_size": 224, "anchors": [[[0.76120044, 0.57155991], [0.6923348, 0.88535553], [0.47163042, 0.34163313]]], "labels": ["aeroplane","person","diningtable","bottle","bird","bus","boat","cow","sheep","train"], "obj_thresh" : 0.5, "iou_thresh" : 0.5, "coord_scale" : 2.0, "object_scale" : 2.0, "no_object_scale" : 1.0 }, "weights" : { "full": "", "backend": "imagenet" }, "train" : { "actual_epoch": 1, "train_image_folder": "sample_datasets/detector/imgs", "train_annot_folder": "sample_datasets/detector/anns", "train_times": 4, "valid_image_folder": "sample_datasets/detector/imgs_validation", "valid_annot_folder": "sample_datasets/detector/anns_validation", "valid_times": 4, "valid_metric": "mAP", "batch_size": 4, "learning_rate": 1e-4, "saved_folder": "detector", "first_trainable_layer": "", "augmentation": true, "is_only_detect" : false }, "converter" : { "type": ["k210", "tflite"] } } ================================================ FILE: configs/dogs_classifier.json ================================================ { "model" : { "type": "Classifier", "architecture": "NASNetMobile", "input_size": 224, "fully-connected": [], "labels": [], "dropout" : 0.2 }, "weights" : { "full": "", "backend": "imagenet", "save_bottleneck": false }, "train" : { "actual_epoch": 100, "train_image_folder": "/home/ubuntu/datasets/dogs_classification/imgs", "train_times": 1, "valid_image_folder": "/home/ubuntu/datasets/dogs_classification/imgs_validation", "valid_times": 1, "valid_metric": "val_accuracy", "batch_size": 16, "learning_rate": 1e-3, "saved_folder": "dogs_classifier", "first_trainable_layer": "", "augmentation": true }, "converter" : { "type": ["tflite"] } } ================================================ FILE: configs/face_detector.json ================================================ { "model":{ "type": "Detector", "architecture": "MobileNet2_5", "input_size": [240, 320], "anchors": [[[0.51424575, 0.54116074], [0.29523918, 0.45838044], [0.21371929, 0.21518053]]], "labels": ["face"], "obj_thresh" : 0.5, "iou_thresh" : 0.5, "coord_scale" : 2.0, 
"object_scale" : 2.0, "no_object_scale" : 1.0 }, "weights" : { "full": "", "backend": "imagenet" }, "train" : { "actual_epoch": 30, "train_image_folder": "/home/ubuntu/datasets/WideFace_large/imgs", "train_annot_folder": "/home/ubuntu/datasets/WideFace_large/anns", "train_times": 1, "valid_image_folder": "/home/ubuntu/datasets/WideFace_large/imgs_validation", "valid_annot_folder": "/home/ubuntu/datasets/WideFace_large/anns_validation", "valid_times": 1, "valid_metric": "val_recall", "batch_size": 32, "learning_rate": 1e-3, "saved_folder": "face_detector", "first_trainable_layer": "", "augmentation": true, "is_only_detect" : false }, "converter" : { "type": ["k210"] } } ================================================ FILE: configs/kangaroo_detector.json ================================================ { "model" : { "type": "Detector", "architecture": "MobileNet2_5", "input_size": 224, "anchors": [[[0.76120044, 0.57155991], [0.6923348, 0.88535553], [0.47163042, 0.34163313]]], "labels": ["kangaroo"], "obj_thresh" : 0.5, "iou_thresh" : 0.5, "coord_scale" : 2.0, "object_scale" : 2.0, "no_object_scale" : 1.0 }, "weights" : { "full": "", "backend": "imagenet" }, "train" : { "actual_epoch": 50, "train_image_folder": "/home/ubuntu/datasets/kangaroo_detection/imgs", "train_annot_folder": "/home/ubuntu/datasets/kangaroo_detection/anns", "train_times": 4, "valid_image_folder": "/home/ubuntu/datasets/kangaroo_detection/imgs_validation", "valid_annot_folder": "/home/ubuntu/datasets/kangaroo_detection/anns_validation", "valid_times": 2, "valid_metric": "mAP", "batch_size": 8, "learning_rate": 1e-3, "saved_folder": "kangaroo_detector", "first_trainable_layer": "", "augmentation": true, "is_only_detect" : false }, "converter" : { "type": ["openvino"] } } ================================================ FILE: configs/lego_detector.json ================================================ { "model" : { "type": "Detector", "architecture": "MobileNet7_5", "input_size": 224, "anchors": [[[0.76120044, 0.57155991], [0.6923348, 0.88535553], [0.47163042, 0.34163313]]], "labels": ["lego"], "obj_thresh" : 0.5, "iou_thresh" : 0.5, "coord_scale" : 2.0, "object_scale" : 2.0, "no_object_scale" : 1.0 }, "weights" : { "full": "", "backend": "imagenet" }, "train" : { "actual_epoch": 15, "train_image_folder": "../dataset/imgs", "train_annot_folder": "../dataset/anns", "train_times": 2, "valid_image_folder": "../dataset/imgs_validation", "valid_annot_folder": "../dataset/anns_validation", "valid_times": 2, "valid_metric": "mAP", "batch_size": 32, "learning_rate": 1e-3, "saved_folder": "detector", "first_trainable_layer": "", "augmentation": true, "is_only_detect" : false }, "converter" : { "type": ["edgetpu"] } } ================================================ FILE: configs/pascal_20_detector.json ================================================ { "model" : { "type": "Detector", "architecture": "MobileNet7_5", "input_size": 224, "anchors": [[[0.76120044, 0.57155991], [0.6923348, 0.88535553], [0.47163042, 0.34163313]]], "labels": ["person", "bird", "cat", "cow", "dog", "horse", "sheep", "aeroplane", "bicycle", "boat", "bus", "car", "motorbike", "train","bottle", "chair", "diningtable", "pottedplant", "sofa", "tvmonitor"], "obj_thresh" : 0.5, "iou_thresh" : 0.5, "coord_scale" : 2.0, "object_scale" : 2.0, "no_object_scale" : 1.0 }, "weights" : { "full": "", "backend": "imagenet" }, "train" : { "actual_epoch": 50, "train_image_folder": "/home/ubuntu/datasets/pascal_20_detection/imgs", "train_annot_folder": 
"/home/ubuntu/datasets/pascal_20_detection/anns", "train_times": 1, "valid_image_folder": "/home/ubuntu/datasets/pascal_20_detection/imgs_validation", "valid_annot_folder": "/home/ubuntu/datasets/pascal_20_detection/anns_validation", "valid_times": 1, "valid_metric": "val_loss", "batch_size": 32, "learning_rate": 1e-3, "saved_folder": "pascal", "first_trainable_layer": "", "augmentation": true, "is_only_detect" : false }, "converter" : { "type": ["tflite"] } } ================================================ FILE: configs/pascal_20_detector_2.json ================================================ { "model" : { "type": "Detector", "architecture": "MobileNet1_0", "input_size": [224, 320], "anchors": [[[0.76120044, 0.57155991], [0.6923348, 0.88535553], [0.47163042, 0.34163313]], [[0.33340788, 0.70065861], [0.18124964, 0.38986752], [0.08497349, 0.1527057 ]]], "labels": ["person", "bird", "cat", "cow", "dog", "horse", "sheep", "aeroplane", "bicycle", "boat", "bus", "car", "motorbike", "train","bottle", "chair", "diningtable", "pottedplant", "sofa", "tvmonitor"], "obj_thresh" : 0.5, "iou_thresh" : 0.5, "coord_scale" : 1.0, "object_scale" : 3.0, "no_object_scale" : 1.0 }, "weights" : { "full": "", "backend": "imagenet" }, "train" : { "actual_epoch": 50, "train_image_folder": "/home/ubuntu/datasets/pascal_20_detection/imgs", "train_annot_folder": "/home/ubuntu/datasets/pascal_20_detection/anns", "train_times": 1, "valid_image_folder": "/home/ubuntu/datasets/pascal_20_detection/imgs_validation", "valid_annot_folder": "/home/ubuntu/datasets/pascal_20_detection/anns_validation", "valid_times": 1, "valid_metric": "recall", "batch_size": 32, "learning_rate": 1e-3, "saved_folder": "pascal", "first_trainable_layer": "", "augmentation": true, "is_only_detect" : false }, "converter" : { "type": ["tflite"] } } ================================================ FILE: configs/pascal_20_segnet.json ================================================ { "model" : { "type": "SegNet", "architecture": "MobileNet7_5", "input_size": 224, "n_classes" : 20 }, "weights" : { "full": "", "backend": "imagenet" }, "train" : { "actual_epoch": 50, "train_image_folder": "/home/ubuntu/datasets/pascal_20_segmentation/imgs", "train_annot_folder": "/home/ubuntu/datasets/pascal_20_segmentation/anns", "train_times": 1, "valid_image_folder": "/home/ubuntu/datasets/pascal_20_segmentation/imgs_validation", "valid_annot_folder": "/home/ubuntu/datasets/pascal_20_segmentation/anns_validation", "valid_times": 1, "valid_metric": "val_loss", "batch_size": 8, "learning_rate": 1e-3, "saved_folder": "pascal_20", "first_trainable_layer": "0", "ignore_zero_class": false, "augmentation": true }, "converter" : { "type": ["tflite"] } } ================================================ FILE: configs/person_detector.json ================================================ { "model" : { "type": "Detector", "architecture": "MobileNet7_5", "input_size": [224, 320], "anchors": [[[0.76120044, 0.57155991], [0.6923348, 0.88535553], [0.47163042, 0.34163313]], [[0.33340788, 0.70065861], [0.18124964, 0.38986752], [0.08497349, 0.1527057 ]]], "labels": ["person"], "obj_thresh" : 0.7, "iou_thresh" : 0.5, "coord_scale" : 1.0, "class_scale" : 1.0, "object_scale" : 5.0, "no_object_scale" : 1.0 }, "weights" : { "full": "", "backend": "imagenet" }, "train" : { "actual_epoch": 100, "train_image_folder": "/home/ubuntu/datasets/pascal_20_detection/imgs", "train_annot_folder": "/home/ubuntu/datasets/pascal_20_detection/anns", "train_times": 1, "valid_image_folder": 
"/home/ubuntu/datasets/pascal_20_detection/imgs_validation", "valid_annot_folder": "/home/ubuntu/datasets/pascal_20_detection/anns_validation", "valid_times": 1, "valid_metric": "recall", "batch_size": 32, "learning_rate": 1e-3, "saved_folder": "person_detector", "first_trainable_layer": "", "augmentation": true, "is_only_detect" : false }, "converter" : { "type": ["k210", "tflite"] } } ================================================ FILE: configs/raccoon_detector.json ================================================ { "model" : { "type": "Detector", "architecture": "MobileNet5_0", "input_size": [240, 320], "anchors": [[[0.76120044, 0.57155991], [0.6923348, 0.88535553], [0.47163042, 0.34163313]]], "labels": ["raccoon"], "obj_thresh" : 0.5, "iou_thresh" : 0.5, "coord_scale" : 2.0, "object_scale" : 2.0, "no_object_scale" : 1.0 }, "weights" : { "full": "", "backend": "imagenet" }, "train" : { "actual_epoch": 50, "train_image_folder": "/home/ubuntu/datasets/raccoon_detector/imgs", "train_annot_folder": "/home/ubuntu/datasets/raccoon_detector/anns", "train_times": 2, "valid_image_folder": "/home/ubuntu/datasets/raccoon_detector/imgs_validation", "valid_annot_folder": "/home/ubuntu/datasets/raccoon_detector/anns_validation", "valid_times": 2, "valid_metric": "recall", "batch_size": 4, "learning_rate": 1e-4, "saved_folder": "raccoon_detector", "first_trainable_layer": "", "augmentation": true, "is_only_detect" : false }, "converter" : { "type": ["k210"] } } ================================================ FILE: configs/santa_uno.json ================================================ { "model" : { "type": "Classifier", "architecture": "MobileNet7_5", "input_size": 224, "fully-connected": [], "labels": [], "dropout" : 0.5 }, "weights" : { "full": "", "backend": "imagenet", "save_bottleneck": false }, "train" : { "actual_epoch": 3, "train_image_folder": "/home/ubuntu/santa_uno_dataset/imgs", "train_times": 1, "valid_image_folder": "/home/ubuntu/santa_uno_dataset/imgs_validation", "valid_times": 1, "valid_metric": "val_accuracy", "batch_size": 8, "learning_rate": 1e-4, "saved_folder": "santa_uno", "first_trainable_layer": "", "augmentation": true }, "converter" : { "type": ["k210", "tflite"] } } ================================================ FILE: configs/segmentation.json ================================================ { "model" : { "type": "SegNet", "architecture": "MobileNet7_5", "input_size": 224, "n_classes" : 20 }, "weights" : { "full": "", "backend": "imagenet" }, "train" : { "actual_epoch": 1, "train_image_folder": "sample_datasets/segmentation/imgs", "train_annot_folder": "sample_datasets/segmentation/anns", "train_times": 4, "valid_image_folder": "sample_datasets/segmentation/imgs_validation", "valid_annot_folder": "sample_datasets/segmentation/anns_validation", "valid_times": 4, "valid_metric": "val_loss", "batch_size": 8, "learning_rate": 1e-4, "saved_folder": "segment", "first_trainable_layer": "", "ignore_zero_class": false, "augmentation": true }, "converter" : { "type": ["k210", "tflite"] } } ================================================ FILE: example_scripts/arm_nn/README.md ================================================ # PyArmNN Object Detection Sample Application ## Introduction This sample application guides the user and shows how to perform object detection using PyArmNN API. We assume the user has already built PyArmNN by following the instructions of the README in the main PyArmNN directory. 
We provide example scripts for performing object detection from a video file and from a video stream with `run_video_file.py` and `run_video_stream.py`. The application takes a model and a video file or camera feed as input, runs inference on each frame, and draws bounding boxes around detected objects, with the corresponding labels and confidence scores overlaid. A similar implementation of this object detection application is also provided in C++ in the examples for ArmNN. ## Prerequisites ##### PyArmNN Before proceeding to the next steps, make sure that you have successfully installed the newest version of PyArmNN on your system by following the instructions in the README of the PyArmNN root directory. You can verify that the PyArmNN library is installed and check its version using: ```bash $ pip show pyarmnn ``` You can also verify it by running the following and getting output similar to below: ```bash $ python -c "import pyarmnn as ann;print(ann.GetVersion())" '24.0.0' ``` ##### Dependencies Install the following libraries on your system: ```bash $ sudo apt-get install python3-opencv libqtgui4 libqt4-test ``` Create a virtual environment: ```bash $ python3.7 -m venv devenv --system-site-packages $ source devenv/bin/activate ``` Install the dependencies: ```bash $ pip install -r requirements.txt ``` --- # Performing Object Detection ## Object Detection from Video File The `run_video_file.py` example takes a video file as input, runs inference on each frame, and produces frames with bounding boxes drawn around detected objects. The processed frames are written to a video file. The user can specify these arguments at the command line: * `--video_file_path` - Required: Path to the video file to run object detection on * `--model_file_path` - Required: Path to .tflite, .pb or .onnx object detection model * `--model_name` - Required: The name of the model being used. Assembles the workflow for the input model. The examples support the model names: * `ssd_mobilenet_v1` * `yolo_v3_tiny` * `--label_path` - Required: Path to the labels file for the specified model file * `--output_video_file_path` - Path to the output video file with detections added in * `--preferred_backends` - You can specify one or more backends in order of preference. Accepted backends include `CpuAcc, GpuAcc, CpuRef`. Arm NN will decide which layers of the network are supported by the backend, falling back to the next if a layer is unsupported. Defaults to `['CpuAcc', 'CpuRef']` Run the sample script: ```bash $ python run_video_file.py --video_file_path --model_file_path --model_name ``` ## Object Detection from Video Stream The `run_video_stream.py` example captures frames from a video stream of a device, runs inference on each frame, and produces frames with bounding boxes drawn around detected objects. A window is displayed and refreshed with the latest processed frame. The user can specify these arguments at the command line: * `--video_source` - Device index to access video stream. Defaults to primary device camera at index 0 * `--model_file_path` - Required: Path to .tflite, .pb or .onnx object detection model * `--model_name` - Required: The name of the model being used. Assembles the workflow for the input model. The examples support the model names: * `ssd_mobilenet_v1` * `yolo_v3_tiny` * `--label_path` - Required: Path to the labels file for the specified model file * `--preferred_backends` - You can specify one or more backends in order of preference. Accepted backends include `CpuAcc, GpuAcc, CpuRef`.
Arm NN will decide which layers of the network are supported by the backend, falling back to the next if a layer is unsupported. Defaults to `['CpuAcc', 'CpuRef']` Run the sample script: ```bash $ python run_video_stream.py --model_file_path --model_name ``` This application has been verified to work against the MobileNet SSD model, which can be downloaded along with its label set from: * https://storage.googleapis.com/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip ## Implementing Your Own Network The examples provide support for `ssd_mobilenet_v1` and `yolo_v3_tiny` models. However, the user is able to add their own network to the object detection scripts by following these steps: 1. Create a new file for your network, for example `network.py`, to contain functions to process the output of the model 2. In that file, the user will need to write a function that decodes the output vectors obtained from running inference on their network and returns the bounding box positions of detected objects plus their class index and confidence. Additionally, include a function that returns a resize factor that will scale the obtained bounding boxes to their correct positions in the original frame 3. Import the functions into the main file and, as with the provided networks, add a conditional statement to the `get_model_processing()` function with the new model name and functions 4. The labels associated with the model can then be passed in with the `--label_path` argument --- # Application Overview This section provides a walkthrough of the application, explaining in detail the steps: 1. Initialisation 2. Creating a Network 3. Preparing the Workload Tensors 4. Executing Inference 5. Postprocessing ### Initialisation ##### Reading from Video Source After parsing user arguments, the chosen video file or stream is loaded into an OpenCV `cv2.VideoCapture()` object. We use this object to capture frames from the source using the `read()` function. The `VideoCapture` object also tells us information about the source, such as the framerate and resolution of the input video. Using this information, we create a `cv2.VideoWriter()` object which will be used at the end of every loop to write the processed frame to an output video file of the same format as the input. ##### Preparing Labels and Model Specific Functions In order to interpret the result of running inference on the loaded network, it is required to load the labels associated with the model. In the provided example code, the `dict_labels()` function creates a dictionary that is keyed on the classification index at the output node of the model, with values of the dictionary corresponding to a label and a randomly generated RGB color. This ensures that each class has a unique color which will prove helpful when plotting the bounding boxes of various detected objects in a frame. Depending on the model being used, the user-specified model name accesses and returns functions to decode and process the inference output, along with a resize factor used when plotting bounding boxes to ensure they are scaled to their correct position in the original frame. ### Creating a Network ##### Creating Parser and Importing Graph The first step with PyArmNN is to import a graph from file by using the appropriate parser. The Arm NN SDK provides parsers for reading graphs from a variety of model formats. In our application we specifically focus on `.tflite, .pb, .onnx` models.
Based on the extension of the provided model file, the corresponding parser is created and the network file loaded with the `CreateNetworkFromBinaryFile()` function. The parser will handle the creation of the underlying Arm NN graph. ##### Optimizing Graph for Compute Device Arm NN supports optimized execution on multiple CPU and GPU devices. Prior to executing a graph, we must select the appropriate device context. We do this by creating a runtime context with default options with `IRuntime()`. We can optimize the imported graph by specifying a list of backends in order of preference and implement backend-specific optimizations. The backends are identified by a string unique to the backend, for example `CpuAcc, GpuAcc, CpuRef`. Internally and transparently, Arm NN splits the graph into subgraphs based on the backends, calls an optimization function on each of them and, if possible, substitutes the corresponding subgraph in the original graph with its optimized version. Using the `Optimize()` function we optimize the graph for inference and load the optimized network onto the compute device with `LoadNetwork()`. This function creates the backend-specific workloads for the layers and a backend-specific workload factory which is called to create the workloads. ##### Creating Input and Output Binding Information Parsers can also be used to extract the input information for the network. By calling `GetSubgraphInputTensorNames` we extract all the input names and, with `GetNetworkInputBindingInfo`, bind the input points of the graph. The input binding information contains all the essential information about the input. It is a tuple consisting of integer identifiers for bindable layers (inputs, outputs) and the tensor info (data type, quantization information, number of dimensions, total number of elements). Similarly, we can get the output binding information for an output layer by using the parser to retrieve output tensor names and calling `GetNetworkOutputBindingInfo()`. ### Preparing the Workload Tensors ##### Preprocessing the Captured Frame Each frame captured from the source is read as an `ndarray` in BGR format and therefore has to be preprocessed before being passed into the network. This preprocessing step consists of swapping channels (BGR to RGB in this example), resizing the frame to the required resolution, expanding dimensions of the array and doing data type conversion to match the model input layer. This information about the input tensor can be readily obtained from reading the `input_binding_info`. For example, SSD MobileNet V1 takes as input a tensor with shape `[1, 300, 300, 3]` and data type `uint8`. ##### Making Input and Output Tensors To produce the workload tensors, calling the functions `make_input_tensors()` and `make_output_tensors()` will return the input and output tensors respectively. ### Executing Inference After making the workload tensors, a compute device performs inference for the loaded network using the `EnqueueWorkload()` function of the runtime context. By calling the `workload_tensors_to_ndarray()` function, we obtain the results from inference as a list of `ndarrays`. ### Postprocessing ##### Decoding and Processing Inference Output The output from inference must be decoded to obtain information about detected objects in the frame. In the examples there are implementations for two networks but you may also implement your own network decoding solution here. Please refer to the Implementing Your Own Network section of this document to learn how to do this.
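Before moving on to model-specific decoding, the sketch below condenses the steps above into one place. It follows `network_executor.py` from this repository's `example_scripts/arm_nn` directory; the model path is a placeholder and a zero-filled array stands in for a preprocessed frame.

```python
import numpy as np
import pyarmnn as ann

# Parse the model and build the Arm NN graph ('model.tflite' is a placeholder).
parser = ann.ITfLiteParser()
network = parser.CreateNetworkFromBinaryFile('model.tflite')

# Create a runtime context, optimize for the preferred backends, load the network.
runtime = ann.IRuntime(ann.CreationOptions())
backends = [ann.BackendId('CpuAcc'), ann.BackendId('CpuRef')]
opt_network, _ = ann.Optimize(network, backends,
                              runtime.GetDeviceSpec(), ann.OptimizerOptions())
net_id, _ = runtime.LoadNetwork(opt_network)

# Extract input/output binding information from the parser.
graph_id = parser.GetSubgraphCount() - 1
input_name = parser.GetSubgraphInputTensorNames(graph_id)[0]
input_binding_info = parser.GetNetworkInputBindingInfo(graph_id, input_name)
output_binding_info = [parser.GetNetworkOutputBindingInfo(graph_id, name)
                       for name in parser.GetSubgraphOutputTensorNames(graph_id)]

# Make workload tensors and run inference; a zero array stands in for a
# preprocessed frame (assumes a float32 input layer; check GetDataType()).
frame = np.zeros(tuple(input_binding_info[1].GetShape()), dtype=np.float32)
input_tensors = ann.make_input_tensors([input_binding_info], [frame])
output_tensors = ann.make_output_tensors(output_binding_info)
runtime.EnqueueWorkload(net_id, input_tensors, output_tensors)
results = ann.workload_tensors_to_ndarray(output_tensors)
```

The `results` list is what the model-specific decoding functions discussed next consume.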
For SSD MobileNet V1 models, we decode the results to obtain the bounding box positions, classification index, confidence and number of detections in the input frame. For YOLO V3 Tiny models, we decode the output and perform non-maximum suppression to filter out any weak detections below a confidence threshold and any redundant bounding boxes above an intersection-over-union threshold. You are encouraged to experiment with threshold values for confidence and intersection-over-union (IoU) to achieve the best visual results. The detection results are always returned as a list in the form `[class index, [box positions], confidence score]`, with the box positions list containing bounding box coordinates in the form `[x_min, y_min, x_max, y_max]`. ##### Drawing Bounding Boxes With the obtained results and using `draw_bounding_boxes()`, we are able to draw bounding boxes around detected objects and add the associated label and confidence score. The labels dictionary created earlier uses the class index of the detected object as a key to return the associated label and color for that class. The resize factor defined at the beginning scales the bounding box coordinates to their correct positions in the original frame. The processed frames are written to file or displayed in a separate window. ================================================ FILE: example_scripts/arm_nn/box.py ================================================ import numpy as np import cv2 # Todo : BoundBox & its related method extraction class BoundBox: def __init__(self, x, y, w, h, c = None, classes = None): self.x = x self.y = y self.w = w self.h = h self.c = c self.classes = classes def get_label(self): return np.argmax(self.classes) def get_score(self): return self.classes[self.get_label()] def iou(self, bound_box): b1 = self.as_centroid() b2 = bound_box.as_centroid() return centroid_box_iou(b1, b2) def as_centroid(self): return np.array([self.x, self.y, self.w, self.h]) def boxes_to_array(bound_boxes): """ # Args boxes : list of BoundBox instances # Returns centroid_boxes : (N, 4) probs : (N, nb_classes) """ centroid_boxes = [] probs = [] for box in bound_boxes: centroid_boxes.append([box.x, box.y, box.w, box.h]) probs.append(box.classes) return np.array(centroid_boxes), np.array(probs) def nms_boxes(boxes, n_classes, nms_threshold=0.3, obj_threshold=0.3): """ # Args boxes : list of BoundBox # Returns boxes : list of BoundBox non-maximum suppressed BoundBox instances """ # suppress non-maximal boxes for c in range(n_classes): sorted_indices = list(reversed(np.argsort([box.classes[c] for box in boxes]))) for i in range(len(sorted_indices)): index_i = sorted_indices[i] if boxes[index_i].classes[c] == 0: continue else: for j in range(i+1, len(sorted_indices)): index_j = sorted_indices[j] if boxes[index_i].iou(boxes[index_j]) >= nms_threshold: boxes[index_j].classes[c] = 0 # remove the boxes which are less likely than obj_threshold boxes = [box for box in boxes if box.get_score() > obj_threshold] return boxes def draw_scaled_boxes(image, boxes, probs, labels, desired_size=400): img_size = min(image.shape[:2]) if img_size < desired_size: scale_factor = float(desired_size) / img_size else: scale_factor = 1.0 h, w = image.shape[:2] img_scaled = cv2.resize(image, (int(w*scale_factor), int(h*scale_factor))) if len(boxes) > 0: boxes_scaled = boxes*scale_factor boxes_scaled = boxes_scaled.astype(int) else: boxes_scaled = boxes return draw_boxes(img_scaled, boxes_scaled, probs, labels) def draw_boxes(image, boxes, probs, labels): for box,
classes in zip(boxes, probs): x1, y1, x2, y2 = box cv2.rectangle(image, (x1,y1), (x2,y2), (0,255,0), 3) cv2.putText(image, '{}: {:.2f}'.format(labels[np.argmax(classes)], classes.max()), (x1, y1 - 13), cv2.FONT_HERSHEY_SIMPLEX, 1e-3 * image.shape[0], (0,255,0), 2) return image def centroid_box_iou(box1, box2): def _interval_overlap(interval_a, interval_b): x1, x2 = interval_a x3, x4 = interval_b if x3 < x1: if x4 < x1: return 0 else: return min(x2,x4) - x1 else: if x2 < x3: return 0 else: return min(x2,x4) - x3 _, _, w1, h1 = box1.reshape(-1,) _, _, w2, h2 = box2.reshape(-1,) x1_min, y1_min, x1_max, y1_max = to_minmax(box1.reshape(-1,4)).reshape(-1,) x2_min, y2_min, x2_max, y2_max = to_minmax(box2.reshape(-1,4)).reshape(-1,) intersect_w = _interval_overlap([x1_min, x1_max], [x2_min, x2_max]) intersect_h = _interval_overlap([y1_min, y1_max], [y2_min, y2_max]) intersect = intersect_w * intersect_h union = w1 * h1 + w2 * h2 - intersect return float(intersect) / union def to_centroid(minmax_boxes): """ minmax_boxes : (N, 4) """ minmax_boxes = minmax_boxes.astype(float) centroid_boxes = np.zeros_like(minmax_boxes) x1 = minmax_boxes[:,0] y1 = minmax_boxes[:,1] x2 = minmax_boxes[:,2] y2 = minmax_boxes[:,3] centroid_boxes[:,0] = (x1 + x2) / 2 centroid_boxes[:,1] = (y1 + y2) / 2 centroid_boxes[:,2] = x2 - x1 centroid_boxes[:,3] = y2 - y1 return centroid_boxes def to_minmax(centroid_boxes): centroid_boxes = centroid_boxes.astype(float) minmax_boxes = np.zeros_like(centroid_boxes) cx = centroid_boxes[:,0] cy = centroid_boxes[:,1] w = centroid_boxes[:,2] h = centroid_boxes[:,3] minmax_boxes[:,0] = cx - w/2 minmax_boxes[:,1] = cy - h/2 minmax_boxes[:,2] = cx + w/2 minmax_boxes[:,3] = cy + h/2 return minmax_boxes def create_anchor_boxes(anchors): """ # Args anchors : list of floats # Returns boxes : array, shape of (len(anchors)/2, 4) centroid-type """ boxes = [] n_boxes = int(len(anchors)/2) for i in range(n_boxes): boxes.append(np.array([0, 0, anchors[2*i], anchors[2*i+1]])) return np.array(boxes) def find_match_box(centroid_box, centroid_boxes): """Find the index of the boxes with the largest overlap among the N-boxes. # Args box : array, shape of (1, 4) boxes : array, shape of (N, 4) # Return match_index : int """ match_index = -1 max_iou = -1 for i, box in enumerate(centroid_boxes): iou = centroid_box_iou(centroid_box, box) if max_iou < iou: match_index = i max_iou = iou return match_index ================================================ FILE: example_scripts/arm_nn/cv_utils.py ================================================ # Copyright © 2020 Arm Ltd and Contributors. All rights reserved. # SPDX-License-Identifier: MIT """ This file contains helper functions for reading video/image data and pre/postprocessing of video/image data using OpenCV. """ import os import cv2 import numpy as np import pyarmnn as ann def preprocess(frame: np.ndarray, input_binding_info: tuple): """ Takes a frame, resizes, swaps channels and converts data type to match model input layer. The converted frame is wrapped in a const tensor and bound to the input tensor. Args: frame: Captured frame from video. input_binding_info: Contains shape and data type of model input layer. Returns: Input tensor.
""" # Swap channels and resize frame to model resolution frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) resized_frame = resize_with_aspect_ratio(frame, input_binding_info) # Expand dimensions and convert data type to match model input data_type = np.float32 if input_binding_info[1].GetDataType() == ann.DataType_Float32 else np.uint8 resized_frame = np.expand_dims(np.asarray(resized_frame, dtype=data_type), axis=0) resized_frame /= 255. resized_frame -= 0.5 resized_frame *= 2 assert resized_frame.shape == tuple(input_binding_info[1].GetShape()) input_tensors = ann.make_input_tensors([input_binding_info], [resized_frame]) return input_tensors def resize_with_aspect_ratio(frame: np.ndarray, input_binding_info: tuple): """ Resizes frame while maintaining aspect ratio, padding any empty space. Args: frame: Captured frame. input_binding_info: Contains shape of model input layer. Returns: Frame resized to the size of model input layer. """ aspect_ratio = frame.shape[1] / frame.shape[0] model_height, model_width = list(input_binding_info[1].GetShape())[1:3] if aspect_ratio >= 1.0: new_height, new_width = int(model_width / aspect_ratio), model_width b_padding, r_padding = model_height - new_height, 0 else: new_height, new_width = model_height, int(model_height * aspect_ratio) b_padding, r_padding = 0, model_width - new_width # Resize and pad any empty space frame = cv2.resize(frame, (new_width, new_height), interpolation=cv2.INTER_LINEAR) frame = cv2.copyMakeBorder(frame, top=0, bottom=b_padding, left=0, right=r_padding, borderType=cv2.BORDER_CONSTANT, value=[0, 0, 0]) return frame def create_video_writer(video: cv2.VideoCapture, video_path: str, output_path: str): """ Creates a video writer object to write processed frames to file. Args: video: Video capture object, contains information about data source. video_path: User-specified video file path. output_path: Optional path to save the processed video. Returns: Video writer object. """ _, ext = os.path.splitext(video_path) if output_path is not None: assert os.path.isdir(output_path) i, filename = 0, os.path.join(output_path if output_path is not None else str(), f'object_detection_demo{ext}') while os.path.exists(filename): i += 1 filename = os.path.join(output_path if output_path is not None else str(), f'object_detection_demo({i}){ext}') video_writer = cv2.VideoWriter(filename=filename, fourcc=get_source_encoding_int(video), fps=int(video.get(cv2.CAP_PROP_FPS)), frameSize=(int(video.get(cv2.CAP_PROP_FRAME_WIDTH)), int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)))) return video_writer def init_video_file_capture(video_path: str, output_path: str): """ Creates a video capture object from a video file. Args: video_path: User-specified video file path. output_path: Optional path to save the processed video. Returns: Video capture object to capture frames, video writer object to write processed frames to file, plus total frame count of video source to iterate through. """ if not os.path.exists(video_path): raise FileNotFoundError(f'Video file not found for: {video_path}') video = cv2.VideoCapture(video_path) if not video.isOpened: raise RuntimeError(f'Failed to open video capture from file: {video_path}') video_writer = create_video_writer(video, video_path, output_path) iter_frame_count = range(int(video.get(cv2.CAP_PROP_FRAME_COUNT))) return video, video_writer, iter_frame_count def init_video_stream_capture(video_source: int): """ Creates a video capture object from a device. Args: video_source: Device index used to read video stream. 
Returns: Video capture object used to capture frames from a video stream. """ video = cv2.VideoCapture(video_source) if not video.isOpened(): raise RuntimeError(f'Failed to open video capture for device with index: {video_source}') print('Processing video stream. Press \'Esc\' key to exit the demo.') return video def draw_bounding_boxes(frame: np.ndarray, detections: list, resize_factor, labels: dict): """ Draws bounding boxes around detected objects and adds a label and confidence score. Args: frame: The original captured frame from video source. detections: A list of detected objects in the form [class, [box positions], confidence]. resize_factor: Resizing factor to scale box coordinates to output frame size. labels: Dictionary of labels and colors keyed on the classification index. """ for detection in detections: class_idx, box, confidence = [d for d in detection] label, color = labels[class_idx][0].capitalize(), labels[class_idx][1] # Obtain frame size and resized bounding box positions frame_height, frame_width = frame.shape[:2] x_min, y_min, x_max, y_max = [int(position * resize_factor) for position in box] # Ensure box stays within the frame x_min, y_min = max(0, x_min), max(0, y_min) x_max, y_max = min(frame_width, x_max), min(frame_height, y_max) # Draw bounding box around detected object cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), color, 2) # Create label for detected object class label = f'{label} {confidence * 100:.1f}%' label_color = (0, 0, 0) if sum(color)>200 else (255, 255, 255) # Make sure label always stays on-screen x_text, y_text = cv2.getTextSize(label, cv2.FONT_HERSHEY_DUPLEX, 1, 1)[0][:2] lbl_box_xy_min = (x_min, y_min if y_min<25 else y_min - y_text) lbl_box_xy_max = (x_min + int(0.55 * x_text), y_min + y_text if y_min<25 else y_min) lbl_text_pos = (x_min + 5, y_min + 16 if y_min<25 else y_min - 5) # Add label and confidence value cv2.rectangle(frame, lbl_box_xy_min, lbl_box_xy_max, color, -1) cv2.putText(frame, label, lbl_text_pos, cv2.FONT_HERSHEY_DUPLEX, 0.50, label_color, 1, cv2.LINE_AA) def get_source_encoding_int(video_capture): return int(video_capture.get(cv2.CAP_PROP_FOURCC)) ================================================ FILE: example_scripts/arm_nn/network_executor.py ================================================ # Copyright © 2020 Arm Ltd and Contributors. All rights reserved. # SPDX-License-Identifier: MIT import os from typing import List, Tuple import pyarmnn as ann import numpy as np def create_network(model_file: str, backends: list, input_names: Tuple[str] = (), output_names: Tuple[str] = ()): """ Creates a network based on the model file and a list of backends. Args: model_file: User-specified model file. backends: List of backends to optimize network. input_names: Optional input tensor names (currently unused; read from the parser). output_names: Optional output tensor names (currently unused; read from the parser). Returns: net_id: Unique ID of the network to run. runtime: Runtime context for executing inference. input_binding_info: Contains essential information about the model input. output_binding_info: Used to map output tensor and its memory. """ if not os.path.exists(model_file): raise FileNotFoundError(f'Model file not found for: {model_file}') _, ext = os.path.splitext(model_file) if ext == '.tflite': parser = ann.ITfLiteParser() else: raise ValueError("Supplied model file type is not supported.
Supported types are [ tflite ]") network = parser.CreateNetworkFromBinaryFile(model_file) # Specify backends to optimize network preferred_backends = [] for b in backends: preferred_backends.append(ann.BackendId(b)) # Select appropriate device context and optimize the network for that device options = ann.CreationOptions() runtime = ann.IRuntime(options) opt_network, messages = ann.Optimize(network, preferred_backends, runtime.GetDeviceSpec(), ann.OptimizerOptions()) print(f'Preferred backends: {backends}\n{runtime.GetDeviceSpec()}\n' f'Optimization warnings: {messages}') # Load the optimized network onto the Runtime device net_id, _ = runtime.LoadNetwork(opt_network) # Get input and output binding information graph_id = parser.GetSubgraphCount() - 1 input_names = parser.GetSubgraphInputTensorNames(graph_id) input_binding_info = parser.GetNetworkInputBindingInfo(graph_id, input_names[0]) output_names = parser.GetSubgraphOutputTensorNames(graph_id) output_binding_info = [] for output_name in output_names: out_bind_info = parser.GetNetworkOutputBindingInfo(graph_id, output_name) output_binding_info.append(out_bind_info) return net_id, runtime, input_binding_info, output_binding_info def execute_network(input_tensors: list, output_tensors: list, runtime, net_id: int) -> List[np.ndarray]: """ Executes inference for the loaded network. Args: input_tensors: The input frame tensor. output_tensors: The output tensor from output node. runtime: Runtime context for executing inference. net_id: Unique ID of the network to run. Returns: list: Inference results as a list of ndarrays. """ runtime.EnqueueWorkload(net_id, input_tensors, output_tensors) output = ann.workload_tensors_to_ndarray(output_tensors) return output class ArmnnNetworkExecutor: def __init__(self, model_file: str, backends: list): """ Creates an inference executor for a given network and a list of backends. Args: model_file: User-specified model file. backends: List of backends to optimize network. """ self.network_id, self.runtime, self.input_binding_info, self.output_binding_info = create_network(model_file, backends) self.output_tensors = ann.make_output_tensors(self.output_binding_info) def run(self, input_tensors: list) -> List[np.ndarray]: """ Executes inference for the loaded network. Args: input_tensors: The input frame tensor. Returns: list: Inference results as a list of ndarrays. """ return execute_network(input_tensors, self.output_tensors, self.runtime, self.network_id) ================================================ FILE: example_scripts/arm_nn/run_video_file.py ================================================ # Copyright © 2020 Arm Ltd and Contributors. All rights reserved. # SPDX-License-Identifier: MIT """ Object detection demo that takes a video file, runs inference on each frame producing bounding boxes and labels around detected objects, and saves the processed video. 
python3 run_video_file.py --fd_model_file_path YOLO_best_mAP.tflite --kp_model_file MobileFaceNet_kpts.tflite --video_file_path test_s.mp4 """ import os import sys import time script_dir = os.path.dirname(__file__) sys.path.insert(1, os.path.join(script_dir, '..', 'common')) import cv2 import numpy as np from tqdm import tqdm from argparse import ArgumentParser from yolov2 import yolo_processing, yolo_resize_factor from utils import dict_labels from cv_utils import init_video_file_capture, resize_with_aspect_ratio from network_executor import ArmnnNetworkExecutor import pyarmnn as ann def preprocess(frame: np.ndarray, input_binding_info: tuple): """ Takes a frame, resizes, swaps channels and converts data type to match model input layer. The converted frame is wrapped in a const tensor and bound to the input tensor. Args: frame: Captured frame from video. input_binding_info: Contains shape and data type of model input layer. Returns: Input tensor. """ # Swap channels and resize frame to model resolution frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) resized_frame = resize_with_aspect_ratio(frame, input_binding_info) # Expand dimensions and convert data type to match model input data_type = np.float32 if input_binding_info[1].GetDataType() == ann.DataType_Float32 else np.uint8 resized_frame = np.expand_dims(np.asarray(resized_frame, dtype=data_type), axis=0) resized_frame /= 255. resized_frame -= 0.5 resized_frame *= 2 assert resized_frame.shape == tuple(input_binding_info[1].GetShape()) input_tensors = ann.make_input_tensors([input_binding_info], [resized_frame]) return input_tensors def process_faces(frame, detections, executor_kp, resize_factor): kpts_list = [] frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) for detection in detections: box = detection[1].copy() for i in range(len(box)): box[i] = int(box[i] * resize_factor) x, y, w, h = box[0], box[1], box[2] - box[0], box[3] - box[1] face_img = frame[box[1]:box[3], box[0]:box[2]] face_img = cv2.resize(face_img, (128, 128)) face_img = face_img.astype(np.float32) face_img /= 127.5 face_img -= 1. input_tensors = ann.make_input_tensors([executor_kp.input_binding_info], [face_img]) plist = executor_kp.run(input_tensors)[0][0] le = (x + int(plist[0] * w+5), y + int(plist[1] * h+5)) re = (x + int(plist[2] * w), y + int(plist[3] * h+5)) n = (x + int(plist[4] * w), y + int(plist[5] * h)) lm = (x + int(plist[6] * w), y + int(plist[7] * h)) rm = (x + int(plist[8] * w), y + int(plist[9] * h)) kpts = [le, re, n, lm, rm] kpts_list.append(kpts) return kpts_list def draw_bounding_boxes(frame: np.ndarray, detections: list, resize_factor, kpts): """ Draws bounding boxes around detected objects and adds a label and confidence score. Args: frame: The original captured frame from video source. detections: A list of detected objects in the form [class, [box positions], confidence]. resize_factor: Resizing factor to scale box coordinates to output frame size. labels: Dictionary of labels and colors keyed on the classification index. 
""" for detection in detections: class_idx, box, confidence = [d for d in detection] label, color = 'Person', (0, 255, 0) # Obtain frame size and resized bounding box positions frame_height, frame_width = frame.shape[:2] x_min, y_min, x_max, y_max = [int(position * resize_factor) for position in box] # Ensure box stays within the frame x_min, y_min = max(0, x_min), max(0, y_min) x_max, y_max = min(frame_width, x_max), min(frame_height, y_max) # Draw bounding box around detected object cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), color, 2) # Create label for detected object class label = f'{label} {confidence * 100:.1f}%' label_color = (0, 0, 0) if sum(color)>200 else (255, 255, 255) # Make sure label always stays on-screen x_text, y_text = cv2.getTextSize(label, cv2.FONT_HERSHEY_DUPLEX, 1, 1)[0][:2] lbl_box_xy_min = (x_min, y_min if y_min<25 else y_min - y_text) lbl_box_xy_max = (x_min + int(0.55 * x_text), y_min + y_text if y_min<25 else y_min) lbl_text_pos = (x_min + 5, y_min + 16 if y_min<25 else y_min - 5) # Add label and confidence value cv2.rectangle(frame, lbl_box_xy_min, lbl_box_xy_max, color, -1) cv2.putText(frame, label, lbl_text_pos, cv2.FONT_HERSHEY_DUPLEX, 0.50, label_color, 1, cv2.LINE_AA) for kpt_set in kpts: for kpt in kpt_set: cv2.circle(frame, (int(kpt[0]), int(kpt[1])), 5, (255, 0, 0), 2) def main(args): video, video_writer, frame_count = init_video_file_capture(args.video_file_path, args.output_video_file_path) frame_num = len(frame_count) executor_fd = ArmnnNetworkExecutor(args.fd_model_file_path, args.preferred_backends) executor_kp = ArmnnNetworkExecutor(args.kp_model_file_path, args.preferred_backends) process_output, resize_factor = yolo_processing, yolo_resize_factor(video, executor_fd.input_binding_info) times = [] for _ in tqdm(frame_count, desc='Processing frames'): frame_present, frame = video.read() if not frame_present: continue input_tensors = preprocess(frame, executor_fd.input_binding_info) start_time = time.time() # start time of the loop output_result = executor_fd.run(input_tensors) detections = process_output(output_result) kpts = process_faces(frame, detections, executor_kp, resize_factor) draw_bounding_boxes(frame, detections, resize_factor, kpts) end_time = (time.time() - start_time)*1000 times.append(end_time) video_writer.write(frame) print('Finished processing frames') video.release(), video_writer.release() print("Average time(ms): ", sum(times)//frame_num) print("FPS: ", 1000.0 / (sum(times)//frame_num)) # FPS = 1 / time to process loop if __name__ == '__main__': parser = ArgumentParser() parser.add_argument('--video_file_path', required=True, type=str, help='Path to the video file to run object detection on') parser.add_argument('--fd_model_file_path', required=True, type=str, help='Path to the Object Detection model to use') parser.add_argument('--kp_model_file_path', required=True, type=str, help='Path to the Object Detection model to use') parser.add_argument('--output_video_file_path', type=str, help='Path to the output video file with detections added in') parser.add_argument('--preferred_backends', type=str, nargs='+', default=['CpuAcc', 'CpuRef'], help='Takes the preferred backends in preference order, separated by whitespace, ' 'for example: CpuAcc GpuAcc CpuRef. Accepted options: [CpuAcc, CpuRef, GpuAcc]. 
' 'Defaults to [CpuAcc, CpuRef]') args = parser.parse_args() main(args) ================================================ FILE: example_scripts/arm_nn/run_video_stream.py ================================================ """ Object detection demo that takes a video stream from a device, runs inference on each frame producing bounding boxes and labels around detected objects, and displays a window with the latest processed frame. """ import os import sys import time script_dir = os.path.dirname(__file__) sys.path.insert(1, os.path.join(script_dir, '..', 'common')) import cv2 import numpy as np from tqdm import tqdm from argparse import ArgumentParser from yolov2 import yolo_processing, yolo_resize_factor from cv_utils import init_video_stream_capture, resize_with_aspect_ratio from network_executor import ArmnnNetworkExecutor import pyarmnn as ann def preprocess(frame: np.ndarray, input_binding_info: tuple): """ Takes a frame, resizes, swaps channels and converts data type to match model input layer. The converted frame is wrapped in a const tensor and bound to the input tensor. Args: frame: Captured frame from video. input_binding_info: Contains shape and data type of model input layer. Returns: Input tensor. """ # Swap channels and resize frame to model resolution frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) resized_frame = resize_with_aspect_ratio(frame, input_binding_info) # Expand dimensions and convert data type to match model input data_type = np.float32 if input_binding_info[1].GetDataType() == ann.DataType_Float32 else np.uint8 resized_frame = np.expand_dims(np.asarray(resized_frame, dtype=data_type), axis=0) resized_frame /= 255. resized_frame -= 0.5 resized_frame *= 2 assert resized_frame.shape == tuple(input_binding_info[1].GetShape()) input_tensors = ann.make_input_tensors([input_binding_info], [resized_frame]) return input_tensors def process_faces(frame, detections, executor_kp, resize_factor): kpts_list = [] frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) for detection in detections: box = detection[1].copy() for i in range(len(box)): box[i] = int(box[i] * resize_factor) x, y, w, h = box[0], box[1], box[2] - box[0], box[3] - box[1] face_img = frame[box[1]:box[3], box[0]:box[2]] face_img = cv2.resize(face_img, (128, 128)) #cv2.imshow('PyArmNN Object Detection Demo face', face_img) face_img = face_img.astype(np.float32) face_img /= 127.5 face_img -= 1. input_tensors = ann.make_input_tensors([executor_kp.input_binding_info], [face_img]) plist = executor_kp.run(input_tensors)[0][0] le = (x + int(plist[0] * w+5), y + int(plist[1] * h+5)) re = (x + int(plist[2] * w), y + int(plist[3] * h+5)) n = (x + int(plist[4] * w), y + int(plist[5] * h)) lm = (x + int(plist[6] * w), y + int(plist[7] * h)) rm = (x + int(plist[8] * w), y + int(plist[9] * h)) kpts = [le, re, n, lm, rm] kpts_list.append(kpts) return kpts_list def draw_bounding_boxes(frame: np.ndarray, detections: list, resize_factor, kpts): """ Draws bounding boxes around detected objects and adds a label and confidence score. Args: frame: The original captured frame from video source. detections: A list of detected objects in the form [class, [box positions], confidence]. resize_factor: Resizing factor to scale box coordinates to output frame size. labels: Dictionary of labels and colors keyed on the classification index. 
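kpts: Facial keypoint coordinates from process_faces, drawn as circles on top of each detection.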
""" for detection in detections: class_idx, box, confidence = [d for d in detection] label, color = 'Person', (0, 255, 0) # Obtain frame size and resized bounding box positions frame_height, frame_width = frame.shape[:2] x_min, y_min, x_max, y_max = [int(position * resize_factor) for position in box] # Ensure box stays within the frame x_min, y_min = max(0, x_min), max(0, y_min) x_max, y_max = min(frame_width, x_max), min(frame_height, y_max) # Draw bounding box around detected object cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), color, 2) # Create label for detected object class label = f'{label} {confidence * 100:.1f}%' label_color = (0, 0, 0) if sum(color)>200 else (255, 255, 255) # Make sure label always stays on-screen x_text, y_text = cv2.getTextSize(label, cv2.FONT_HERSHEY_DUPLEX, 1, 1)[0][:2] lbl_box_xy_min = (x_min, y_min if y_min<25 else y_min - y_text) lbl_box_xy_max = (x_min + int(0.55 * x_text), y_min + y_text if y_min<25 else y_min) lbl_text_pos = (x_min + 5, y_min + 16 if y_min<25 else y_min - 5) # Add label and confidence value cv2.rectangle(frame, lbl_box_xy_min, lbl_box_xy_max, color, -1) cv2.putText(frame, label, lbl_text_pos, cv2.FONT_HERSHEY_DUPLEX, 0.50, label_color, 1, cv2.LINE_AA) for kpt_set in kpts: for kpt in kpt_set: cv2.circle(frame, (int(kpt[0]), int(kpt[1])), 5, (255, 0, 0), 2) def main(args): video = init_video_stream_capture(args.video_source) executor_fd = ArmnnNetworkExecutor(args.fd_model_file_path, args.preferred_backends) executor_kp = ArmnnNetworkExecutor(args.kp_model_file_path, args.preferred_backends) process_output, resize_factor = yolo_processing, yolo_resize_factor(video, executor_fd.input_binding_info) while True: frame_present, frame = video.read() frame = cv2.flip(frame, 1) # Horizontally flip the frame if not frame_present: raise RuntimeError('Error reading frame from video stream') input_tensors = preprocess(frame, executor_fd.input_binding_info) print("Running inference...") start_time = time.time() output_result = executor_fd.run(input_tensors) detections = process_output(output_result) kpts = process_faces(frame, detections, executor_kp, resize_factor) print("FPS: ", 1.0 / (time.time() - start_time)) # FPS = 1 / time to process loop print("Time(ms): ", (time.time() - start_time)*1000) draw_bounding_boxes(frame, detections, resize_factor, kpts) cv2.imshow('PyArmNN Object Detection Demo', frame) if cv2.waitKey(1) == 27: print('\nExit key activated. Closing video...') break video.release(), cv2.destroyAllWindows() if __name__ == '__main__': parser = ArgumentParser() parser.add_argument('--video_source', type=int, default=0, help='Device index to access video stream. Defaults to primary device camera at index 0') parser.add_argument('--fd_model_file_path', required=True, type=str, help='Path to the Object Detection model to use') parser.add_argument('--kp_model_file_path', required=True, type=str, help='Path to the Object Detection model to use') parser.add_argument('--preferred_backends', type=str, nargs='+', default=['CpuAcc', 'CpuRef'], help='Takes the preferred backends in preference order, separated by whitespace, ' 'for example: CpuAcc GpuAcc CpuRef. Accepted options: [CpuAcc, CpuRef, GpuAcc]. ' 'Defaults to [CpuAcc, CpuRef]') args = parser.parse_args() main(args) ================================================ FILE: example_scripts/arm_nn/yolov2.py ================================================ # Copyright © 2020 Arm Ltd and Contributors. All rights reserved. 
# SPDX-License-Identifier: MIT """ Contains functions specific to decoding and processing inference results for YOLO v2 models. """ import cv2 import numpy as np from box import BoundBox, nms_boxes, boxes_to_array, to_minmax, draw_boxes def yolo_processing(netout): """Convert YOLO network output to bounding boxes # Args netout : 4d-array, shape of (grid_h, grid_w, num of boxes per grid, 5 + n_classes) YOLO neural network output array # Returns predictions : list of [class_idx, box, prob] entries with box coordinates scaled to the 224x224 model input """ anchors = [1.889, 2.5245, 2.9465, 3.94056, 3.99987, 5.3658, 5.155437, 6.92275, 6.718375, 9.01025] nms_threshold = 0.2 netout = netout[0].reshape(7,7,5,6) # 7x7 grid, 5 anchors, 6 = 4 box coords + 1 objectness + 1 class grid_h, grid_w, nb_box = netout.shape[:3] boxes = [] # decode the output of the network netout[..., 4] = _sigmoid(netout[..., 4]) netout[..., 5:] = netout[..., 4][..., np.newaxis] * _softmax(netout[..., 5:]) netout[..., 5:] *= netout[..., 5:] > 0.3 for row in range(grid_h): for col in range(grid_w): for b in range(nb_box): # elements from index 5 onwards are the class probabilities classes = netout[row,col,b,5:] if np.sum(classes) > 0: # first 4 elements are x, y, w, and h x, y, w, h = netout[row,col,b,:4] x = (col + _sigmoid(x)) / grid_w # center position, unit: image width y = (row + _sigmoid(y)) / grid_h # center position, unit: image height w = anchors[2 * b + 0] * np.exp(w) / grid_w # unit: image width h = anchors[2 * b + 1] * np.exp(h) / grid_h # unit: image height confidence = netout[row,col,b,4] box = BoundBox(x, y, w, h, confidence, classes) boxes.append(box) boxes = nms_boxes(boxes, len(classes), nms_threshold, 0.3) boxes, probs = boxes_to_array(boxes) predictions = [] def _to_original_scale(boxes): minmax_boxes = to_minmax(boxes) minmax_boxes[:,0] *= 224 minmax_boxes[:,2] *= 224 minmax_boxes[:,1] *= 224 minmax_boxes[:,3] *= 224 return minmax_boxes.astype(int) # the np.int alias was removed in NumPy 1.24; use the builtin int if len(boxes) > 0: boxes = _to_original_scale(boxes) for i in range(len(boxes)): predictions.append([0, boxes[i], probs[i][0]]) return predictions def _sigmoid(x): return 1. / (1. + np.exp(-x)) def _softmax(x, axis=-1, t=-100.): x = x - np.max(x) if np.min(x) < t: x = x/np.min(x)*t e_x = np.exp(x) return e_x / e_x.sum(axis, keepdims=True) def yolo_resize_factor(video: cv2.VideoCapture, input_binding_info: tuple): """ Gets a multiplier to scale the bounding box positions to their correct position in the frame. Args: video: Video capture object, contains information about data source. input_binding_info: Contains shape of model input layer. Returns: Resizing factor to scale box coordinates to output frame size.
""" frame_height = video.get(cv2.CAP_PROP_FRAME_HEIGHT) frame_width = video.get(cv2.CAP_PROP_FRAME_WIDTH) model_height, model_width = list(input_binding_info[1].GetShape())[1:3] return max(frame_height, frame_width) / max(model_height, model_width) ================================================ FILE: example_scripts/edge_tpu/detector/box.py ================================================ import numpy as np import cv2 # Todo : BoundBox & its related method extraction class BoundBox: def __init__(self, x, y, w, h, c = None, classes = None): self.x = x self.y = y self.w = w self.h = h self.c = c self.classes = classes def get_label(self): return np.argmax(self.classes) def get_score(self): return self.classes[self.get_label()] def iou(self, bound_box): b1 = self.as_centroid() b2 = bound_box.as_centroid() return centroid_box_iou(b1, b2) def as_centroid(self): return np.array([self.x, self.y, self.w, self.h]) def boxes_to_array(bound_boxes): """ # Args boxes : list of BoundBox instances # Returns centroid_boxes : (N, 4) probs : (N, nb_classes) """ centroid_boxes = [] probs = [] for box in bound_boxes: centroid_boxes.append([box.x, box.y, box.w, box.h]) probs.append(box.classes) return np.array(centroid_boxes), np.array(probs) def nms_boxes(boxes, n_classes, nms_threshold=0.3, obj_threshold=0.3): """ # Args boxes : list of BoundBox # Returns boxes : list of BoundBox non maximum supressed BoundBox instances """ # suppress non-maximal boxes for c in range(n_classes): sorted_indices = list(reversed(np.argsort([box.classes[c] for box in boxes]))) for i in range(len(sorted_indices)): index_i = sorted_indices[i] if boxes[index_i].classes[c] == 0: continue else: for j in range(i+1, len(sorted_indices)): index_j = sorted_indices[j] if boxes[index_i].iou(boxes[index_j]) >= nms_threshold: boxes[index_j].classes[c] = 0 # remove the boxes which are less likely than a obj_threshold boxes = [box for box in boxes if box.get_score() > obj_threshold] return boxes def draw_scaled_boxes(image, boxes, probs, labels, desired_size=400): img_size = min(image.shape[:2]) if img_size < desired_size: scale_factor = float(desired_size) / img_size else: scale_factor = 1.0 h, w = image.shape[:2] img_scaled = cv2.resize(image, (int(w*scale_factor), int(h*scale_factor))) if boxes != []: boxes_scaled = boxes*scale_factor boxes_scaled = boxes_scaled.astype(np.int) else: boxes_scaled = boxes return draw_boxes(img_scaled, boxes_scaled, probs, labels) def draw_boxes(image, boxes, probs, labels): for box, classes in zip(boxes, probs): x1, y1, x2, y2 = box cv2.rectangle(image, (x1,y1), (x2,y2), (0,255,0), 3) cv2.putText(image, '{}: {:.2f}'.format(labels[np.argmax(classes)], classes.max()), (x1, y1 - 13), cv2.FONT_HERSHEY_SIMPLEX, 1e-3 * image.shape[0], (0,255,0), 2) return image def centroid_box_iou(box1, box2): def _interval_overlap(interval_a, interval_b): x1, x2 = interval_a x3, x4 = interval_b if x3 < x1: if x4 < x1: return 0 else: return min(x2,x4) - x1 else: if x2 < x3: return 0 else: return min(x2,x4) - x3 _, _, w1, h1 = box1.reshape(-1,) _, _, w2, h2 = box2.reshape(-1,) x1_min, y1_min, x1_max, y1_max = to_minmax(box1.reshape(-1,4)).reshape(-1,) x2_min, y2_min, x2_max, y2_max = to_minmax(box2.reshape(-1,4)).reshape(-1,) intersect_w = _interval_overlap([x1_min, x1_max], [x2_min, x2_max]) intersect_h = _interval_overlap([y1_min, y1_max], [y2_min, y2_max]) intersect = intersect_w * intersect_h union = w1 * h1 + w2 * h2 - intersect return float(intersect) / union def to_centroid(minmax_boxes): """ minmax_boxes : (N, 4) """ 
minmax_boxes = minmax_boxes.astype(np.float) centroid_boxes = np.zeros_like(minmax_boxes) x1 = minmax_boxes[:,0] y1 = minmax_boxes[:,1] x2 = minmax_boxes[:,2] y2 = minmax_boxes[:,3] centroid_boxes[:,0] = (x1 + x2) / 2 centroid_boxes[:,1] = (y1 + y2) / 2 centroid_boxes[:,2] = x2 - x1 centroid_boxes[:,3] = y2 - y1 return centroid_boxes def to_minmax(centroid_boxes): centroid_boxes = centroid_boxes.astype(np.float) minmax_boxes = np.zeros_like(centroid_boxes) cx = centroid_boxes[:,0] cy = centroid_boxes[:,1] w = centroid_boxes[:,2] h = centroid_boxes[:,3] minmax_boxes[:,0] = cx - w/2 minmax_boxes[:,1] = cy - h/2 minmax_boxes[:,2] = cx + w/2 minmax_boxes[:,3] = cy + h/2 return minmax_boxes def create_anchor_boxes(anchors): """ # Args anchors : list of floats # Returns boxes : array, shape of (len(anchors)/2, 4) centroid-type """ boxes = [] n_boxes = int(len(anchors)/2) for i in range(n_boxes): boxes.append(np.array([0, 0, anchors[2*i], anchors[2*i+1]])) return np.array(boxes) def find_match_box(centroid_box, centroid_boxes): """Find the index of the boxes with the largest overlap among the N-boxes. # Args box : array, shape of (1, 4) boxes : array, shape of (N, 4) # Return match_index : int """ match_index = -1 max_iou = -1 for i, box in enumerate(centroid_boxes): iou = centroid_box_iou(centroid_box, box) if max_iou < iou: match_index = i max_iou = iou return match_index ================================================ FILE: example_scripts/edge_tpu/detector/detector_video.py ================================================ import argparse import io import time import numpy as np import cv2 from box import BoundBox, nms_boxes, boxes_to_array, to_minmax, draw_boxes #from tflite_runtime.interpreter import Interpreter import tflite_runtime.interpreter as tflite class Detector(object): def __init__(self, label_file, model_file, threshold): self._threshold = float(threshold) self.labels = self.load_labels(label_file) self.interpreter = tflite.Interpreter(model_file, experimental_delegates=[tflite.load_delegate('libedgetpu.so.1')]) self.interpreter.allocate_tensors() _, self.input_height, self.input_width, _ = self.interpreter.get_input_details()[0]['shape'] self.tensor_index = self.interpreter.get_input_details()[0]['index'] def load_labels(self, path): with open(path, 'r') as f: return {i: line.strip() for i, line in enumerate(f.read().replace('"','').split(','))} def preprocess(self, img): img = cv2.resize(img, (self.input_width, self.input_height)) img = img.astype(np.float32) img = img / 255. img = img - 0.5 img = img * 2. 
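# the three steps above map uint8 pixel values from [0, 255] to [-1, 1], matching MobileNet-style preprocessing used at training time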
img = img[:, :, ::-1] # reverse channel order (BGR -> RGB) img = np.expand_dims(img, 0) return img def get_output_tensor(self, index): """Returns the output tensor at the given index.""" output_details = self.interpreter.get_output_details()[index] tensor = np.squeeze(self.interpreter.get_tensor(output_details['index'])) return tensor def detect_objects(self, image): """Returns the raw YOLO output tensor, reshaped to (grid_h, grid_w, n_anchors, 5 + n_classes).""" img = self.preprocess(image) self.interpreter.set_tensor(self.tensor_index, img) self.interpreter.invoke() # Get all output details raw_detections = self.get_output_tensor(0) output_shape = [7, 7, 5, 6] output = np.reshape(raw_detections, output_shape) return output def detect(self, original_image): self.output_height, self.output_width = original_image.shape[0:2] start_time = time.time() results = self.detect_objects(original_image) elapsed_ms = (time.time() - start_time) * 1000 fps = 1000.0 / elapsed_ms print("Estimated frames per second : {0:.2f} Inference time: {1:.2f}".format(fps, elapsed_ms)) def _to_original_scale(boxes): minmax_boxes = to_minmax(boxes) minmax_boxes[:,0] *= self.output_width minmax_boxes[:,2] *= self.output_width minmax_boxes[:,1] *= self.output_height minmax_boxes[:,3] *= self.output_height return minmax_boxes.astype(int) # the np.int alias was removed in NumPy 1.24; use the builtin int boxes, probs = self.run(results) if len(boxes) > 0: boxes = _to_original_scale(boxes) original_image = draw_boxes(original_image, boxes, probs, self.labels) return original_image def run(self, netout): """Convert YOLO network output to bounding boxes # Args netout : 4d-array, shape of (grid_h, grid_w, num of boxes per grid, 5 + n_classes) YOLO neural network output array # Returns boxes : array, shape of (N, 4) coordinate scale is normalized [0, 1] probs : array, shape of (N, nb_classes) """ anchors = [0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828] nms_threshold = 0.2 grid_h, grid_w, nb_box = netout.shape[:3] boxes = [] # decode the output of the network netout[..., 4] = _sigmoid(netout[..., 4]) netout[..., 5:] = netout[..., 4][..., np.newaxis] * _softmax(netout[..., 5:]) netout[..., 5:] *= netout[..., 5:] > self._threshold for row in range(grid_h): for col in range(grid_w): for b in range(nb_box): # elements from index 5 onwards are the class probabilities classes = netout[row,col,b,5:] if np.sum(classes) > 0: # first 4 elements are x, y, w, and h x, y, w, h = netout[row,col,b,:4] x = (col + _sigmoid(x)) / grid_w # center position, unit: image width y = (row + _sigmoid(y)) / grid_h # center position, unit: image height w = anchors[2 * b + 0] * np.exp(w) / grid_w # unit: image width h = anchors[2 * b + 1] * np.exp(h) / grid_h # unit: image height confidence = netout[row,col,b,4] box = BoundBox(x, y, w, h, confidence, classes) boxes.append(box) boxes = nms_boxes(boxes, len(classes), nms_threshold, self._threshold) boxes, probs = boxes_to_array(boxes) return boxes, probs def _sigmoid(x): return 1. / (1. + np.exp(-x)) def _softmax(x, axis=-1, t=-100.): x = x - np.max(x) if np.min(x) < t: x = x/np.min(x)*t e_x = np.exp(x) return e_x / e_x.sum(axis, keepdims=True) parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--model', help='File path of .tflite file.', required=True) parser.add_argument('--labels', help='File path of labels file.', required=True) parser.add_argument('--threshold', help='Confidence threshold.', default=0.3) args = parser.parse_args() detector = Detector(args.labels, args.model, args.threshold) camera = cv2.VideoCapture(2) # camera index is hardcoded; change it to match your device while(camera.isOpened()): ret, frame = camera.read() if ret == True: # run detection only if a frame was actually read image = detector.detect(frame) # Display the resulting frame cv2.imshow('Frame', image) # Press Q on keyboard to exit if cv2.waitKey(25) & 0xFF == ord('q'): break # Break the loop else: break # When everything done, release the video capture object camera.release() # Closes all the frames cv2.destroyAllWindows() ================================================ FILE: example_scripts/k210/classifier/santa_uno.py ================================================ # tested with firmware maixpy_v0.6.2_72_g22a8555b5_openmv_kmodel_v4_with_ide_support import sensor, image, lcd, time import KPU as kpu lcd.init() sensor.reset() sensor.set_pixformat(sensor.RGB565) sensor.set_framesize(sensor.QVGA) sensor.set_windowing((224, 224)) sensor.set_vflip(1) lcd.clear() labels=['arduino_uno','santa_claus'] #number of labels should match the number of labels the model was trained with task = kpu.load(0x200000) #change to "/sd/name_of_the_model_file.kmodel" if loading from SD card kpu.set_outputs(task, 0, 1, 1, 2) #the actual shape needs to match the last layer shape of your model while(True): kpu.memtest() img = sensor.snapshot() #img = img.rotation_corr(z_rotation=90.0) uncomment if rotation correction is needed - only present in full maixpy firmware #a = img.pix_to_ai() fmap = kpu.forward(task, img) plist=fmap[:] pmax=max(plist) max_index=plist.index(pmax) a = img.draw_string(0,0, str(labels[max_index].strip()), color=(255,0,0), scale=2) a = img.draw_string(0,20, str(pmax), color=(255,0,0), scale=2) print((pmax, labels[max_index].strip())) a = lcd.display(img) a = kpu.deinit(task) ================================================ FILE: example_scripts/k210/detector/yolov2/person_detector_v4.py ================================================ #tested with firmware maixpy_v0.6.2_72_g22a8555b5_openmv_kmodel_v4_with_ide_support import sensor, image, lcd import KPU as kpu lcd.init() sensor.reset() sensor.set_pixformat(sensor.RGB565) sensor.set_framesize(sensor.QVGA) sensor.set_windowing((224, 224)) sensor.set_vflip(1) sensor.run(1) classes = ["person"] task = kpu.load(0x200000) #change to "/sd/name_of_the_model_file.kmodel" if loading from SD card a = kpu.set_outputs(task, 0, 7,7,30) #the actual shape needs to match the last layer shape of your model(before Reshape) anchor = (0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828) a = kpu.init_yolo2(task, 0.3, 0.3, 5, anchor) #tweak the second parameter if you're getting too many false positives while(True): img = sensor.snapshot().rotation_corr(z_rotation=180.0) a = img.pix_to_ai() code = kpu.run_yolo2(task, img) if code: for i in code: a = img.draw_rectangle(i.rect(),color = (0, 255, 0)) a = img.draw_string(i.x(),i.y(), classes[i.classid()], color=(255,0,0), scale=3) a = lcd.display(img) else: a = lcd.display(img) a = kpu.deinit(task)
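# Optional: to gauge on-device frame rate, you can use the OpenMV-style clock API
# (assumption: it is available in your MaixPy firmware build):
#   import time
#   clock = time.clock()
# then call clock.tick() at the top of the loop and print(clock.fps()) after lcd.display(img)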
================================================ FILE: example_scripts/k210/detector/yolov2/raccoon_detector.py ================================================ # tested with firmware maixpy_v0.6.2_72_g22a8555b5_openmv_kmodel_v4_with_ide_support import sensor, image, lcd import KPU as kpu lcd.init() sensor.reset() sensor.set_pixformat(sensor.RGB565) sensor.set_framesize(sensor.QVGA) sensor.set_windowing((224, 224)) sensor.set_vflip(1) sensor.run(1) classes = ["raccoon"] task = kpu.load(0x200000) #change to "/sd/name_of_the_model_file.kmodel" if loading from SD card a = kpu.set_outputs(task, 0, 7,7,30) #the actual shape needs to match the last layer shape of your model(before Reshape) anchor = (0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828) a = kpu.init_yolo2(task, 0.3, 0.3, 5, anchor) #tweak the second parameter if you're getting too many false positives while(True): img = sensor.snapshot().rotation_corr(z_rotation=90.0) a = img.pix_to_ai() code = kpu.run_yolo2(task, img) if code: for i in code: a = img.draw_rectangle(i.rect(),color = (0, 255, 0)) a = img.draw_string(i.x(),i.y(), classes[i.classid()], color=(255,0,0), scale=3) a = lcd.display(img) else: a = lcd.display(img) a = kpu.deinit(task) ================================================ FILE: example_scripts/k210/detector/yolov2/raccoon_detector_uart.py ================================================ # tested with firmware 5-0.22 import sensor,image,lcd import KPU as kpu from fpioa_manager import fm from machine import UART from board import board_info lcd.init() sensor.reset() sensor.set_pixformat(sensor.RGB565) sensor.set_framesize(sensor.QVGA) sensor.set_windowing((224, 224)) sensor.set_vflip(1) sensor.run(1) fm.register(board_info.PIN15,fm.fpioa.UART1_TX) fm.register(board_info.PIN17,fm.fpioa.UART1_RX) uart_A = UART(UART.UART1, 115200, 8, None, 1, timeout=1000, read_buf_len=4096) classes = ["raccoon"] task = kpu.load(0x200000) #change to "/sd/name_of_the_model_file.kmodel" if loading from SD card a = kpu.set_outputs(task, 0, 7,7,30) #the actual shape needs to match the last layer shape of your model(before Reshape) anchor = (0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828) a = kpu.init_yolo2(task, 0.3, 0.3, 5, anchor) #tweak the second parameter if you're getting too many false positives while(True): img = sensor.snapshot().rotation_corr(z_rotation=90.0) a = img.pix_to_ai() code = kpu.run_yolo2(task, img) if code: for i in code: a=img.draw_rectangle(i.rect(),color = (0, 255, 0)) a = img.draw_string(i.x(),i.y(), classes[i.classid()], color=(255,0,0), scale=3) uart_A.write(str(i.rect())) a = lcd.display(img) else: a = lcd.display(img) a = kpu.deinit(task) uart_A.deinit() del uart_A ================================================ FILE: example_scripts/k210/detector/yolov3/raccoon_detector.py ================================================ # needs firmware from my fork with yolov3 support, see # https://github.com/sipeed/MaixPy/pull/451 import sensor, image, lcd import KPU as kpu lcd.init() sensor.reset() sensor.set_pixformat(sensor.RGB565) sensor.set_framesize(sensor.QVGA) sensor.set_vflip(1) sensor.run(1) classes = ["raccoon"] task = kpu.load(0x300000) #change to "/sd/name_of_the_model_file.kmodel" if loading from SD card a = kpu.set_outputs(task, 0, 10, 8, 18) #the actual shape needs to match the last layer shape of your model(before Reshape) anchor = (0.76120044, 0.57155991, 0.6923348, 0.88535553, 0.47163042, 0.34163313) a = 
kpu.init_yolo3(task, 0.5, 0.3, 3, 1, anchor) # second parameter - obj_threshold, tweak if you're getting too many false positives # third parameter - nms_threshold # fourth parameter - number of anchors # fifth parameter - number of branches for YOLOv3, in this case we only use one branch while(True): img = sensor.snapshot() #a = img.pix_to_ai() # only necessary if you do opeartions (e.g. resize) on image code = kpu.run_yolo3(task, img) if code: for i in code: a = img.draw_rectangle(i.rect(),color = (0, 255, 0)) a = img.draw_string(i.x(), i.y(), classes[i.classid()], color=(255,0,0), scale = 1.5) a = lcd.display(img) else: a = lcd.display(img) a = kpu.deinit(task) ================================================ FILE: example_scripts/k210/segnet/segnet-support-is-WIP-contributions-welcome ================================================ ================================================ FILE: example_scripts/oak/yolov2/YOLO_best_mAP.json ================================================ { "NN_config": { "output_format" : "raw", "NN_family" : "YOLO", "NN_specific_metadata" : { "classes" : 1, "coordinates" : 4, "anchors" : [10,14, 23,27, 37,58, 81,82, 135,169, 344,319], "anchor_masks" : { "side26" : [1,2,3], "side13" : [3,4,5] }, "iou_threshold" : 0.5, "confidence_threshold" : 0.5 } }, "mappings": { "labels": [ "person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "sofa", "pottedplant", "bed", "diningtable", "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush" ] } } ================================================ FILE: example_scripts/oak/yolov2/box.py ================================================ import numpy as np import cv2 # Todo : BoundBox & its related method extraction class BoundBox: def __init__(self, x, y, w, h, c = None, classes = None): self.x = x self.y = y self.w = w self.h = h self.c = c self.classes = classes def get_label(self): return np.argmax(self.classes) def get_score(self): return self.classes[self.get_label()] def iou(self, bound_box): b1 = self.as_centroid() b2 = bound_box.as_centroid() return centroid_box_iou(b1, b2) def as_centroid(self): return np.array([self.x, self.y, self.w, self.h]) def boxes_to_array(bound_boxes): """ # Args boxes : list of BoundBox instances # Returns centroid_boxes : (N, 4) probs : (N, nb_classes) """ centroid_boxes = [] probs = [] for box in bound_boxes: centroid_boxes.append([box.x, box.y, box.w, box.h]) probs.append(box.classes) return np.array(centroid_boxes), np.array(probs) def nms_boxes(boxes, n_classes, nms_threshold=0.3, obj_threshold=0.3): """ # Args boxes : list of BoundBox # Returns boxes : list of BoundBox non maximum supressed BoundBox instances """ # suppress non-maximal boxes for c in range(n_classes): sorted_indices = list(reversed(np.argsort([box.classes[c] for box in boxes]))) for i in range(len(sorted_indices)): index_i = 
sorted_indices[i] if boxes[index_i].classes[c] == 0: continue else: for j in range(i+1, len(sorted_indices)): index_j = sorted_indices[j] if boxes[index_i].iou(boxes[index_j]) >= nms_threshold: boxes[index_j].classes[c] = 0 # remove the boxes which are less likely than a obj_threshold boxes = [box for box in boxes if box.get_score() > obj_threshold] return boxes def draw_scaled_boxes(image, boxes, probs, labels, desired_size=400): img_size = min(image.shape[:2]) if img_size < desired_size: scale_factor = float(desired_size) / img_size else: scale_factor = 1.0 h, w = image.shape[:2] img_scaled = cv2.resize(image, (int(w*scale_factor), int(h*scale_factor))) if boxes != []: boxes_scaled = boxes*scale_factor boxes_scaled = boxes_scaled.astype(np.int) else: boxes_scaled = boxes return draw_boxes(img_scaled, boxes_scaled, probs, labels) def draw_boxes(image, boxes, probs, labels): for box, classes in zip(boxes, probs): x1, y1, x2, y2 = box cv2.rectangle(image, (x1,y1), (x2,y2), (0,255,0), 3) cv2.putText(image, '{}: {:.2f}'.format(labels[np.argmax(classes)], classes.max()), (x1, y1 - 13), cv2.FONT_HERSHEY_SIMPLEX, 1e-3 * image.shape[0], (0,255,0), 2) return image def centroid_box_iou(box1, box2): def _interval_overlap(interval_a, interval_b): x1, x2 = interval_a x3, x4 = interval_b if x3 < x1: if x4 < x1: return 0 else: return min(x2,x4) - x1 else: if x2 < x3: return 0 else: return min(x2,x4) - x3 _, _, w1, h1 = box1.reshape(-1,) _, _, w2, h2 = box2.reshape(-1,) x1_min, y1_min, x1_max, y1_max = to_minmax(box1.reshape(-1,4)).reshape(-1,) x2_min, y2_min, x2_max, y2_max = to_minmax(box2.reshape(-1,4)).reshape(-1,) intersect_w = _interval_overlap([x1_min, x1_max], [x2_min, x2_max]) intersect_h = _interval_overlap([y1_min, y1_max], [y2_min, y2_max]) intersect = intersect_w * intersect_h union = w1 * h1 + w2 * h2 - intersect return float(intersect) / union def to_centroid(minmax_boxes): """ minmax_boxes : (N, 4) """ minmax_boxes = minmax_boxes.astype(np.float) centroid_boxes = np.zeros_like(minmax_boxes) x1 = minmax_boxes[:,0] y1 = minmax_boxes[:,1] x2 = minmax_boxes[:,2] y2 = minmax_boxes[:,3] centroid_boxes[:,0] = (x1 + x2) / 2 centroid_boxes[:,1] = (y1 + y2) / 2 centroid_boxes[:,2] = x2 - x1 centroid_boxes[:,3] = y2 - y1 return centroid_boxes def to_minmax(centroid_boxes): centroid_boxes = centroid_boxes.astype(np.float) minmax_boxes = np.zeros_like(centroid_boxes) cx = centroid_boxes[:,0] cy = centroid_boxes[:,1] w = centroid_boxes[:,2] h = centroid_boxes[:,3] minmax_boxes[:,0] = cx - w/2 minmax_boxes[:,1] = cy - h/2 minmax_boxes[:,2] = cx + w/2 minmax_boxes[:,3] = cy + h/2 return minmax_boxes def create_anchor_boxes(anchors): """ # Args anchors : list of floats # Returns boxes : array, shape of (len(anchors)/2, 4) centroid-type """ boxes = [] n_boxes = int(len(anchors)/2) for i in range(n_boxes): boxes.append(np.array([0, 0, anchors[2*i], anchors[2*i+1]])) return np.array(boxes) def find_match_box(centroid_box, centroid_boxes): """Find the index of the boxes with the largest overlap among the N-boxes. 
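Performs a linear scan over the candidate boxes, scoring each with centroid IoU and keeping the best match.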
# Args box : array, shape of (1, 4) boxes : array, shape of (N, 4) # Return match_index : int """ match_index = -1 max_iou = -1 for i, box in enumerate(centroid_boxes): iou = centroid_box_iou(centroid_box, box) if max_iou < iou: match_index = i max_iou = iou return match_index ================================================ FILE: example_scripts/oak/yolov2/yolo.py ================================================ import consts.resource_paths import cv2 import depthai import argparse import time import numpy as np IOU_THRESHOLD = 0.1 labels = ['null', 'kangaroo'] GREEN = '\033[1;32m' RED = '\033[1;31m' NOCOLOR = '\033[0m' YELLOW = '\033[1;33m' DEVICE = "MYRIAD" def sigmoid(x): return 1.0 / (1 + np.exp(x * -1.0)) def calculate_overlap(x1, w1, x2, w2): box1_coordinate = max(x1 - w1 / 2.0, x2 - w2 / 2.0) box2_coordinate = min(x1 + w1 / 2.0, x2 + w2 / 2.0) overlap = box2_coordinate - box1_coordinate return overlap def calculate_iou(box, truth): # calculate the iou intersection over union by first calculating the overlapping height and width width_overlap = calculate_overlap(box[0], box[2], truth[0], truth[2]) height_overlap = calculate_overlap(box[1], box[3], truth[1], truth[3]) # no overlap if width_overlap < 0 or height_overlap < 0: return 0 intersection_area = width_overlap * height_overlap union_area = box[2] * box[3] + truth[2] * truth[3] - intersection_area iou = intersection_area / union_area return iou def apply_nms(boxes): # sort the boxes by score in descending order sorted_boxes = sorted(boxes, key=lambda d: d[7])[::-1] high_iou_objs = dict() # compare the iou for each of the detected objects for current_object in range(len(sorted_boxes)): if current_object in high_iou_objs: continue truth = sorted_boxes[current_object] for next_object in range(current_object + 1, len(sorted_boxes)): if next_object in high_iou_objs: continue box = sorted_boxes[next_object] iou = calculate_iou(box, truth) if iou >= IOU_THRESHOLD: high_iou_objs[next_object] = 1 # filter and sort detected items filtered_result = list() for current_object in range(len(sorted_boxes)): if current_object not in high_iou_objs: filtered_result.append(sorted_boxes[current_object]) return filtered_result def post_processing(output, label_list, threshold): num_classes = 1 num_grids = 7 num_anchor_boxes = 5 original_results = output.astype(np.float32) # Tiny Yolo V2 uses a 13 x 13 grid with 5 anchor boxes for each grid cell. 
# This specific model was trained with the VOC Pascal data set and is comprised of 20 classes original_results = np.reshape(original_results, (num_anchor_boxes, 5+num_classes, num_grids, num_grids)) reordered_results = np.transpose(original_results, (2, 3, 0, 1)) reordered_results = np.reshape(reordered_results, (num_grids*num_grids, num_anchor_boxes, 5+num_classes)) # The 125 results need to be re-organized into 5 chunks of 25 values # 20 classes + 1 score + 4 coordinates = 25 values # 25 values for each of the 5 anchor bounding boxes = 125 values #reordered_results = np.zeros((13 * 13, 5, 25)) index = 0 #for row in range( num_grids ): # for col in range( num_grids ): # for b_box_voltron in range(125): # b_box = row * num_grids + col # b_box_num = int(b_box_voltron / 25) # b_box_info = b_box_voltron % 25 # reordered_results[b_box][b_box_num][b_box_info] = original_results[row][col][b_box_voltron] # shapes for the 5 Tiny Yolo v2 bounding boxes anchor_boxes = [0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828] boxes = list() # iterate through the grids and anchor boxes and filter out all scores which do not exceed the DETECTION_THRESHOLD for row in range(num_grids): for col in range(num_grids): for anchor_box_num in range(num_anchor_boxes): box = list() class_list = list() current_score_total = 0 # calculate the coordinates for the current anchor box box_x = (col + sigmoid(reordered_results[row * num_grids + col][anchor_box_num][0])) / 7.0 box_y = (row + sigmoid(reordered_results[row * num_grids + col][anchor_box_num][1])) / 7.0 box_w = (np.exp(reordered_results[row * num_grids + col][anchor_box_num][2]) * anchor_boxes[2 * anchor_box_num]) / 7.0 box_h = (np.exp(reordered_results[row * num_grids + col][anchor_box_num][3]) * anchor_boxes[2 * anchor_box_num + 1]) / 7.0 # find the class with the highest score for class_enum in range(num_classes): class_list.append(reordered_results[row * num_grids + col][anchor_box_num][5 + class_enum]) current_score_total = sum(class_list) for current_class in range(len(class_list)): class_list[current_class] = class_list[current_class] * 1.0 / current_score_total # probability that the current anchor box contains an item object_confidence = sigmoid(reordered_results[row * num_grids + col][anchor_box_num][4]) # highest class score detected for the object in the current anchor box highest_class_score = max(class_list) # index of the class with the highest score class_w_highest_score = class_list.index(max(class_list)) + 1 # the final score for the detected object final_object_score = object_confidence * highest_class_score box.append(box_x) box.append(box_y) box.append(box_w) box.append(box_h) box.append(class_w_highest_score) box.append(object_confidence) box.append(highest_class_score) box.append(final_object_score) # filter out all detected objects with a score less than the threshold if final_object_score > threshold: boxes.append(box) # gets rid of all duplicate boxes using non-maximal suppression results = apply_nms(boxes) return results def show_tiny_yolo(results, original_img, is_depth=0): image_width = original_img.shape[1] image_height = original_img.shape[0] label_list = labels # calculate the actual box coordinates in relation to the input image print('\n Found this many objects in the image: ' + str(len(results))) for box in results: box_xmin = int((box[0] - box[2] / 2.0) * image_width) box_xmax = int((box[0] + box[2] / 2.0) * image_width) box_ymin = int((box[1] - box[3] / 2.0) * image_height) box_ymax = 
int((box[1] + box[3] / 2.0) * image_height) # ensure the box is not drawn out of the window resolution if box_xmin < 0: box_xmin = 0 if box_xmax > image_width: box_xmax = image_width if box_ymin < 0: box_ymin = 0 if box_ymax > image_height: box_ymax = image_height print(" - object: " + YELLOW + label_list[box[4]] + NOCOLOR + " is at left: " + str(box_xmin) + " top: " + str(box_ymin) + " right: " + str(box_xmax) + " bottom: " + str(box_ymax)) # label shape and colorization label_text = label_list[box[4]] + " " + str("{0:.2f}".format(box[5]*box[6])) label_background_color = (70, 120, 70) # grayish green background for text label_text_color = (255, 255, 255) # white text label_size = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)[0] label_left = int(box_xmin) label_top = int(box_ymin) - label_size[1] label_right = label_left + label_size[0] label_bottom = label_top + label_size[1] # set up the colored rectangle background for text cv2.rectangle(original_img, (label_left - 1, label_top - 5),(label_right + 1, label_bottom + 1), label_background_color, -1) # set up text cv2.putText(original_img, label_text, (int(box_xmin), int(box_ymin - 5)), cv2.FONT_HERSHEY_SIMPLEX, 0.5, label_text_color, 1) # set up the rectangle around the object cv2.rectangle(original_img, (int(box_xmin), int(box_ymin)), (int(box_xmax), int(box_ymax)), (0, 255, 0), 2) return original_img parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--model', help='File path of .tflite file.', required=True) parser.add_argument('--config', help='File path of config file.', required=True) parser.add_argument('--threshold', help='Confidence threshold.', default=0.4) args = parser.parse_args() if __name__ == "__main__" : if not depthai.init_device(consts.resource_paths.device_cmd_fpath): raise RuntimeError("Error initializing device. 
Try to reset it.") p = depthai.create_pipeline(config={ "streams": ["metaout", "previewout"], "ai": { "blob_file": args.model, "blob_file_config": 'YOLO_best_mAP.json' } }) if p is None: raise RuntimeError("Error initializing pipelne") recv = False while True: nnet_packets, data_packets = p.get_available_nnet_and_data_packets() for nnet_packet in nnet_packets: raw_detections = nnet_packet.get_tensor(0) raw_detections.dtype = np.float16 raw_detections = np.squeeze(raw_detections) recv = True for packet in data_packets: if packet.stream_name == 'previewout': data = packet.getData() data0 = data[0, :, :] data1 = data[1, :, :] data2 = data[2, :, :] frame = cv2.merge([data0, data1, data2]) if recv: filtered_objects = post_processing(raw_detections, ['kangaroo'], args.threshold) frame = show_tiny_yolo(filtered_objects, frame, 0) cv2.imshow('previewout', frame) if cv2.waitKey(1) == ord('q'): break del p depthai.deinit_device() ================================================ FILE: example_scripts/oak/yolov2/yolo_alt.py ================================================ import consts.resource_paths import cv2 import depthai import argparse import time import numpy as np from box import BoundBox, nms_boxes, boxes_to_array, to_minmax, draw_boxes class Detector(object): def __init__(self, label_file, model_file, threshold): self._threshold = float(threshold) self.labels = self.load_labels(label_file) def load_labels(self, path): with open(path, 'r') as f: return {i: line.strip() for i, line in enumerate(f.read().replace('"','').split(','))} def parse(self, original_image, tensor): #start_time = time.time() #elapsed_ms = (time.time() - start_time) * 1000 #fps = 1 / elapsed_ms*1000 #print("Estimated frames per second : {0:.2f} Inference time: {1:.2f}".format(fps, elapsed_ms)) boxes, probs = self.run(tensor) def _to_original_scale(boxes): minmax_boxes = to_minmax(boxes) minmax_boxes[:,0] *= 224 minmax_boxes[:,2] *= 224 minmax_boxes[:,1] *= 224 minmax_boxes[:,3] *= 224 return minmax_boxes.astype(np.int) if len(boxes) > 0: boxes = _to_original_scale(boxes) #print(boxes) original_image = draw_boxes(original_image, boxes, probs, self.labels) return original_image def run(self, netout): anchors = [0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828] nms_threshold=0.2 """Convert Yolo network output to bounding box # Args netout : 4d-array, shape of (grid_h, grid_w, num of boxes per grid, 5 + n_classes) YOLO neural network output array # Returns boxes : array, shape of (N, 4) coordinate scale is normalized [0, 1] probs : array, shape of (N, nb_classes) """ grid_h, grid_w, nb_box = netout.shape[:3] boxes = [] # decode the output by the network netout[..., 4] = _sigmoid(netout[..., 4]) netout[..., 5:] = netout[..., 4][..., np.newaxis] * _softmax(netout[..., 5:]) netout[..., 5:] *= netout[..., 5:] > self._threshold for row in range(grid_h): for col in range(grid_w): for b in range(nb_box): # from 4th element onwards are confidence and class classes classes = netout[row,col,b,5:] if np.sum(classes) > 0: # first 4 elements are x, y, w, and h x, y, w, h = netout[row,col,b,:4] x = (col + _sigmoid(x)) / grid_w # center position, unit: image width y = (row + _sigmoid(y)) / grid_h # center position, unit: image height w = anchors[2 * b + 0] * np.exp(w) / grid_w # unit: image width h = anchors[2 * b + 1] * np.exp(h) / grid_h # unit: image height confidence = netout[row,col,b,4] box = BoundBox(x, y, w, h, confidence, classes) boxes.append(box) boxes = nms_boxes(boxes, len(classes), 
nms_threshold, self._threshold) boxes, probs = boxes_to_array(boxes) return boxes, probs def _sigmoid(x): return 1. / (1. + np.exp(-x)) def _softmax(x, axis=-1, t=-100.): x = x - np.max(x) if np.min(x) < t: x = x/np.min(x)*t e_x = np.exp(x) return e_x / e_x.sum(axis, keepdims=True) parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--model', help='File path of .tflite file.', required=True) parser.add_argument('--labels', help='File path of labels file.', required=True) parser.add_argument('--threshold', help='Confidence threshold.', default=0.3) args = parser.parse_args() if __name__ == "__main__" : detector = Detector(args.labels, args.model, args.threshold) if not depthai.init_device(consts.resource_paths.device_cmd_fpath): raise RuntimeError("Error initializing device. Try to reset it.") p = depthai.create_pipeline(config={ "streams": ["metaout", "previewout"], "ai": { "blob_file": args.model, "blob_file_config": 'yolov2/YOLO_best_mAP_alt.json' } }) if p is None: raise RuntimeError("Error initializing pipelne") recv = False while True: nnet_packets, data_packets = p.get_available_nnet_and_data_packets() for nnet_packet in nnet_packets: raw_detections = nnet_packet.get_tensor(0) raw_detections.dtype = np.float16 raw_detections = np.squeeze(raw_detections) output_shape = [5, 6, 7, 7] output = np.reshape(raw_detections, output_shape) output = np.transpose(output, (2, 3, 0, 1)) recv = True for packet in data_packets: if packet.stream_name == 'previewout': data = packet.getData() data0 = data[0, :, :] data1 = data[1, :, :] data2 = data[2, :, :] frame = cv2.merge([data0, data1, data2]) if recv: frame = detector.parse(frame, output) cv2.imshow('previewout', frame) if cv2.waitKey(1) == ord('q'): break del p depthai.deinit_device() ================================================ FILE: example_scripts/tensorflow_lite/classifier/base_camera.py ================================================ import time import threading try: from greenlet import getcurrent as get_ident except ImportError: try: from thread import get_ident except ImportError: from _thread import get_ident class CameraEvent(object): """An Event-like class that signals all active clients when a new frame is available. 
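Each client thread gets its own threading.Event keyed by thread id: the camera thread set()s every event when a new frame arrives, and each client clear()s its own event after consuming the frame. Events left set for more than 5 seconds mark clients that have stopped reading and are pruned.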
""" def __init__(self): self.events = {} def wait(self): """Invoked from each client's thread to wait for the next frame.""" ident = get_ident() if ident not in self.events: # this is a new client # add an entry for it in the self.events dict # each entry has two elements, a threading.Event() and a timestamp self.events[ident] = [threading.Event(), time.time()] return self.events[ident][0].wait() def set(self): """Invoked by the camera thread when a new frame is available.""" now = time.time() remove = None for ident, event in self.events.items(): if not event[0].isSet(): # if this client's event is not set, then set it # also update the last set timestamp to now event[0].set() event[1] = now else: # if the client's event is already set, it means the client # did not process a previous frame # if the event stays set for more than 5 seconds, then assume # the client is gone and remove it if now - event[1] > 5: remove = ident if remove: del self.events[remove] def clear(self): """Invoked from each client's thread after a frame was processed.""" self.events[get_ident()][0].clear() class BaseCamera(object): thread = None # background thread that reads frames from camera frame = None # current frame is stored here by background thread last_access = 0 # time of last client access to the camera event = CameraEvent() def __init__(self): """Start the background camera thread if it isn't running yet.""" if BaseCamera.thread is None: BaseCamera.last_access = time.time() # start background frame thread BaseCamera.thread = threading.Thread(target=self._thread) BaseCamera.thread.start() # wait until frames are available while self.get_frame() is None: time.sleep(0) def get_frame(self): """Return the current camera frame.""" BaseCamera.last_access = time.time() # wait for a signal from the camera thread BaseCamera.event.wait() BaseCamera.event.clear() return BaseCamera.frame @staticmethod def frames(): """"Generator that returns frames from the camera.""" raise RuntimeError('Must be implemented by subclasses.') @classmethod def _thread(cls): """Camera background thread.""" print('Starting camera thread.') frames_iterator = cls.frames() for frame in frames_iterator: BaseCamera.frame = frame BaseCamera.event.set() # send signal to clients time.sleep(0) # if there hasn't been any clients asking for frames in # the last 10 seconds then stop the thread if time.time() - BaseCamera.last_access > 10: frames_iterator.close() print('Stopping camera thread due to inactivity.') break BaseCamera.thread = None ================================================ FILE: example_scripts/tensorflow_lite/classifier/camera_opencv.py ================================================ import cv2 from base_camera import BaseCamera class Camera(BaseCamera): video_source = 0 @staticmethod def set_video_source(source): Camera.video_source = source @staticmethod def frames(): camera = cv2.VideoCapture(Camera.video_source) if not camera.isOpened(): raise RuntimeError('Could not start camera.') while True: # read current frame _, img = camera.read() # encode as a jpeg image and return it yield img ================================================ FILE: example_scripts/tensorflow_lite/classifier/camera_pi.py ================================================ import io import time import picamera import picamera.array import cv2 from base_camera import BaseCamera class Camera(BaseCamera): video_source = 0 @staticmethod def set_video_source(source): pass @staticmethod def frames(): with picamera.PiCamera(resolution = (1280,720)) as camera: # 
let camera warm up time.sleep(2) with picamera.array.PiRGBArray(camera, size=(1280,720)) as stream: while True: camera.capture(stream, format='bgr', use_video_port=True) # At this point the image is available as stream.array image = stream.array stream.truncate(0) yield image ================================================ FILE: example_scripts/tensorflow_lite/classifier/classifier_file.py ================================================ import time import argparse import os import cv2 import numpy as np from tqdm import tqdm from cv_utils import init_video_file_capture, decode_classifier, draw_classification, preprocess from tflite_runtime.interpreter import Interpreter def load_labels(path): with open(path, 'r') as f: return {i: line.strip() for i, line in enumerate(f.read().replace('"','').split(','))} class NetworkExecutor(object): def __init__(self, model_file): self.interpreter = Interpreter(model_file, num_threads=3) self.interpreter.allocate_tensors() _, self.input_height, self.input_width, _ = self.interpreter.get_input_details()[0]['shape'] self.tensor_index = self.interpreter.get_input_details()[0]['index'] def get_output_tensors(self): output_details = self.interpreter.get_output_details() tensor_indices = [] tensor_list = [] for output in output_details: tensor = np.squeeze(self.interpreter.get_tensor(output['index'])) tensor_list.append(tensor) return tensor_list def run(self, image): if image.shape[1:2] != (self.input_height, self.input_width): img = cv2.resize(image, (self.input_width, self.input_height)) img = preprocess(img) self.interpreter.set_tensor(self.tensor_index, img) self.interpreter.invoke() return self.get_output_tensors() def main(args): video, video_writer, frame_count = init_video_file_capture(args.file, 'classifier_demo') if not os.path.exists(args.labels[0]): labels = args.labels else: labels = load_labels(args.labels[0]) frame_num = len(frame_count) times = [] for _ in tqdm(frame_count, desc='Processing frames'): frame_present, frame = video.read() if not frame_present: continue start_time = time.time() results = classification_network.run(frame) elapsed_ms = (time.time() - start_time) * 1000 classification = decode_classifier(netout = results, top_k = args.top_k) draw_classification(frame, classification, labels) times.append(elapsed_ms) video_writer.write(frame) print('Finished processing frames') video.release(), video_writer.release() print("Average time(ms): ", sum(times)//frame_num) print("FPS: ", 1000.0 / (sum(times)//frame_num)) # FPS = 1 / time to process loop if __name__ == "__main__" : print("OpenCV version: {}".format(cv2. 
__version__)) parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--model', help='File path of .tflite file.', required=True) parser.add_argument('--labels', nargs="+", help='File path of labels file.', required=True) parser.add_argument('--top_k', help='How many top results to display', default=3) parser.add_argument('--file', help='File path of video file', default=None) args = parser.parse_args() classification_network = NetworkExecutor(args.model) main(args) ================================================ FILE: example_scripts/tensorflow_lite/classifier/classifier_stream.py ================================================ import time import argparse import os import cv2 import numpy as np from cv_utils import decode_classifier, draw_classification, preprocess from tflite_runtime.interpreter import Interpreter from flask import Flask, render_template, request, Response app = Flask (__name__, static_url_path = '') def load_labels(path): with open(path, 'r') as f: return {i: line.strip() for i, line in enumerate(f.read().replace('"','').split(','))} class NetworkExecutor(object): def __init__(self, model_file): self.interpreter = Interpreter(model_file, num_threads=3) self.interpreter.allocate_tensors() _, self.input_height, self.input_width, _ = self.interpreter.get_input_details()[0]['shape'] self.tensor_index = self.interpreter.get_input_details()[0]['index'] def get_output_tensors(self): output_details = self.interpreter.get_output_details() tensor_indices = [] tensor_list = [] for output in output_details: tensor = np.squeeze(self.interpreter.get_tensor(output['index'])) tensor_list.append(tensor) return tensor_list def run(self, image): if image.shape[1:2] != (self.input_height, self.input_width): img = cv2.resize(image, (self.input_width, self.input_height)) img = preprocess(img) self.interpreter.set_tensor(self.tensor_index, img) self.interpreter.invoke() return self.get_output_tensors() class Classifier(NetworkExecutor): def __init__(self, label_file, model_file, top_k): super().__init__(model_file) self.top_k = top_k if not os.path.exists(label_file): self.labels = [label_file] else: self.labels = load_labels(label_file) def classify(self, frame): start_time = time.time() results = self.run(frame) elapsed_ms = (time.time() - start_time) * 1000 classification = decode_classifier(netout = results, top_k = self.top_k) draw_classification(frame, classification, self.labels) fps = 1 / elapsed_ms*1000 print("Estimated frames per second : {0:.2f} Inference time: {1:.2f}".format(fps, elapsed_ms)) return cv2.imencode('.jpg', frame)[1].tobytes() @app.route("/") def index(): return render_template('index.html', name = None) def gen(camera): while True: frame = camera.get_frame() image = classifier.classify(frame) yield (b'--frame\r\n'+b'Content-Type: image/jpeg\r\n\r\n' + image + b'\r\n') @app.route('/video_feed') def video_feed(): return Response(gen(Camera()), mimetype='multipart/x-mixed-replace; boundary=frame') parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--model', help='File path of .tflite file.', required=True) parser.add_argument('--labels', help='File path of labels file.', required=True) parser.add_argument('--top_k', help='How many top results to display', default=3) parser.add_argument('--source', help='picamera or cv', default='cv') args = parser.parse_args() if args.source == "cv": from camera_opencv import Camera source = 0 elif args.source == 
"picamera": from camera_pi import Camera source = 0 Camera.set_video_source(source) classifier = Classifier(args.labels, args.model, args.top_k) if __name__ == "__main__" : app.run(host = '0.0.0.0', port = 5000, debug = True) ================================================ FILE: example_scripts/tensorflow_lite/classifier/cv_utils.py ================================================ # Copyright © 2020 Arm Ltd and Contributors. All rights reserved. # SPDX-License-Identifier: MIT """ This file contains helper functions for reading video/image data and pre/postprocessing of video/image data using OpenCV. """ import os import cv2 import numpy as np def preprocess(img): img = img.astype(np.float32) img = img / 255. img = img - 0.5 img = img * 2. img = img[:, :, ::-1] img = np.expand_dims(img, 0) return img def decode_yolov2(netout, nms_threshold = 0.2, threshold = 0.3, anchors = [1.889, 2.5245, 2.9465, 3.94056, 3.99987, 5.3658, 5.155437, 6.92275, 6.718375, 9.01025]): #Convert Yolo network output to bounding box netout = netout[0].reshape(7,7,5,6) grid_h, grid_w, nb_box = netout.shape[:3] boxes = [] # decode the output by the network netout[..., 4] = _sigmoid(netout[..., 4]) netout[..., 5:] = netout[..., 4][..., np.newaxis] * _softmax(netout[..., 5:]) netout[..., 5:] *= netout[..., 5:] > threshold for row in range(grid_h): for col in range(grid_w): for b in range(nb_box): # from 4th element onwards are confidence and class classes classes = netout[row,col,b,5:] if np.sum(classes) > 0: # first 4 elements are x, y, w, and h x, y, w, h = netout[row,col,b,:4] x = (col + _sigmoid(x)) / grid_w # center position, unit: image width y = (row + _sigmoid(y)) / grid_h # center position, unit: image height w = anchors[2 * b + 0] * np.exp(w) / grid_w # unit: image width h = anchors[2 * b + 1] * np.exp(h) / grid_h # unit: image height confidence = netout[row,col,b,4] box = BoundBox(x, y, w, h, confidence, classes) boxes.append(box) boxes = nms_boxes(boxes, len(classes), nms_threshold, threshold) if len(boxes) > 0: return boxes_to_array(boxes) else: return [] def decode_yolov3(netout, nms_threshold = 0.2, threshold = 0.3, anchors = [[[0.76120044, 0.57155991], [0.6923348, 0.88535553], [0.47163042, 0.34163313]], [[0.33340788, 0.70065861], [0.18124964, 0.38986752], [0.08497349, 0.1527057 ]]]): #Convert Yolo network output to bounding box boxes = [] for l, output in enumerate(netout): grid_h, grid_w, nb_box = output.shape[0:3] # decode the output by the network output[..., 4] = _sigmoid(output[..., 4]) output[..., 5:] = output[..., 4][..., np.newaxis] * _sigmoid(output[..., 5:]) output[..., 5:] *= output[..., 5:] > threshold for row in range(grid_h): for col in range(grid_w): for b in range(nb_box): # from 4th element onwards are confidence and class classes classes = output[row, col, b, 5:] if np.sum(classes) > 0: # first 4 elements are x, y, w, and h x, y, w, h = output[row, col, b, :4] x = (col + _sigmoid(x)) / grid_w # center position, unit: image width y = (row + _sigmoid(y)) / grid_h # center position, unit: image height w = anchors[l][b][0] * np.exp(w) # unit: image width h = anchors[l][b][1] * np.exp(h) # unit: image height confidence = output[row, col, b, 4] box = BoundBox(x, y, w, h, confidence, classes) boxes.append(box) boxes = nms_boxes(boxes, len(classes), nms_threshold, threshold) if len(boxes) > 0: return boxes_to_array(boxes) else: return [] def decode_classifier(netout, top_k=3): netout = netout[0] ordered = np.argsort(netout) results = [(i, netout[i]) for i in ordered[-top_k:][::-1]] return 
results def decode_segnet(netout, labels, class_colors): netout = netout[0] seg_arr = netout.argmax(axis=2) seg_img = np.zeros((netout.shape[0], netout.shape[1], 3)) for c in range(len(labels)): seg_img[:, :, 0] += ((seg_arr[:, :] == c)*(class_colors[c][0])).astype('uint8') seg_img[:, :, 1] += ((seg_arr[:, :] == c)*(class_colors[c][1])).astype('uint8') seg_img[:, :, 2] += ((seg_arr[:, :] == c)*(class_colors[c][2])).astype('uint8') return seg_img def get_legends(class_names, colors): n_classes = len(class_names) legend = np.zeros(((len(class_names) * 25), 150, 3), dtype="uint8") + 255 for (i, (class_name, color)) in enumerate(zip(class_names.values() , colors)): color = [int(c) for c in color] cv2.putText(legend, class_name, (5, (i * 25) + 17),cv2.FONT_HERSHEY_COMPLEX, 0.5, (0, 0, 0), 1) cv2.rectangle(legend, (125, (i * 25)), (150, (i * 25) + 25), tuple(color), -1) return legend def overlay_seg_image(inp_img, seg_img): orininal_h = inp_img.shape[0] orininal_w = inp_img.shape[1] seg_img = cv2.resize(seg_img, (orininal_w, orininal_h)) fused_img = (inp_img/2 + seg_img/2 ).astype('uint8') return fused_img def concat_lenends(seg_img, legend_img): seg_img[:legend_img.shape[0],:legend_img.shape[1]] = np.copy(legend_img) return seg_img def _sigmoid(x): return 1. / (1. + np.exp(-x)) def _softmax(x, axis=-1, t=-100.): x = x - np.max(x) if np.min(x) < t: x = x/np.min(x)*t e_x = np.exp(x) return e_x / e_x.sum(axis, keepdims=True) def resize_with_aspect_ratio(frame: np.ndarray, input_binding_info: tuple): """ Resizes frame while maintaining aspect ratio, padding any empty space. Args: frame: Captured frame. input_binding_info: Contains shape of model input layer. Returns: Frame resized to the size of model input layer. """ aspect_ratio = frame.shape[1] / frame.shape[0] model_height, model_width = list(input_binding_info[1].GetShape())[1:3] if aspect_ratio >= 1.0: new_height, new_width = int(model_width / aspect_ratio), model_width b_padding, r_padding = model_height - new_height, 0 else: new_height, new_width = model_height, int(model_height * aspect_ratio) b_padding, r_padding = 0, model_width - new_width # Resize and pad any empty space frame = cv2.resize(frame, (new_width, new_height), interpolation=cv2.INTER_LINEAR) frame = cv2.copyMakeBorder(frame, top=0, bottom=b_padding, left=0, right=r_padding, borderType=cv2.BORDER_CONSTANT, value=[0, 0, 0]) return frame def create_video_writer(video, video_path, output_name): """ Creates a video writer object to write processed frames to file. Args: video: Video capture object, contains information about data source. video_path: User-specified video file path. output_path: Optional path to save the processed video. Returns: Video writer object. """ _, ext = os.path.splitext(video_path) i, filename = 0, output_name + ext while os.path.exists(filename): i += 1 filename = output_name + str(i) + ext video_writer = cv2.VideoWriter(filename=filename, fourcc=get_source_encoding_int(video), fps=int(video.get(cv2.CAP_PROP_FPS)), frameSize=(int(video.get(cv2.CAP_PROP_FRAME_WIDTH)), int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)))) return video_writer def init_video_file_capture(video_path, output_name): """ Creates a video capture object from a video file. Args: video_path: User-specified video file path. output_path: Optional path to save the processed video. Returns: Video capture object to capture frames, video writer object to write processed frames to file, plus total frame count of video source to iterate through. 
""" if not os.path.exists(video_path): raise FileNotFoundError(f'Video file not found for: {video_path}') video = cv2.VideoCapture(video_path) if not video.isOpened: raise RuntimeError(f'Failed to open video capture from file: {video_path}') video_writer = create_video_writer(video, video_path, output_name) iter_frame_count = range(int(video.get(cv2.CAP_PROP_FRAME_COUNT))) return video, video_writer, iter_frame_count def draw_bounding_boxes(frame, detections, labels=None, processing_function=None): """ Draws bounding boxes around detected objects and adds a label and confidence score. Args: frame: The original captured frame from video source. detections: A list of detected objects in the form [class, [box positions], confidence]. resize_factor: Resizing factor to scale box coordinates to output frame size. labels: Dictionary of labels and colors keyed on the classification index. """ def _to_original_scale(boxes, frame_height, frame_width): minmax_boxes = np.empty(shape=(4, ), dtype=np.int) cx = boxes[0] * frame_width cy = boxes[1] * frame_height w = boxes[2] * frame_width h = boxes[3] * frame_height minmax_boxes[0] = cx - w/2 minmax_boxes[1] = cy - h/2 minmax_boxes[2] = cx + w/2 minmax_boxes[3] = cy + h/2 return minmax_boxes color = (0, 255, 0) label_color = (125, 125, 125) for i in range(len(detections)): class_idx, box, confidence = [d for d in detections[i]] # Obtain frame size and resized bounding box positions frame_height, frame_width = frame.shape[:2] x_min, y_min, x_max, y_max = _to_original_scale(box, frame_height, frame_width) # Ensure box stays within the frame x_min, y_min = max(0, x_min), max(0, y_min) x_max, y_max = min(frame_width, x_max), min(frame_height, y_max) # Draw bounding box around detected object cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), color, 2) if processing_function: roi_img = frame[y_min:y_max, x_min:x_max] label = processing_function(roi_img) else: # Create label for detected object class label = labels[class_idx].capitalize() label = f'{label} {confidence * 100:.1f}%' # Make sure label always stays on-screen x_text, y_text = cv2.getTextSize(label, cv2.FONT_HERSHEY_DUPLEX, 1, 1)[0][:2] lbl_box_xy_min = (x_min, y_min if y_min<25 else y_min - y_text) lbl_box_xy_max = (x_min + int(0.55 * x_text), y_min + y_text if y_min<25 else y_min) lbl_text_pos = (x_min + 5, y_min + 16 if y_min<25 else y_min - 5) # Add label and confidence value cv2.rectangle(frame, lbl_box_xy_min, lbl_box_xy_max, color, -1) cv2.putText(frame, label, lbl_text_pos, cv2.FONT_HERSHEY_DUPLEX, 0.50, label_color, 1, cv2.LINE_AA) def draw_classification(frame, classifications, labels): for i in range(len(classifications)): label_id, prob = classifications[i] text = '%s : %.2f' % (labels[label_id], prob) cv2.putText(frame, text, (10, 20*i+20), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2, True) def get_source_encoding_int(video_capture): return int(video_capture.get(cv2.CAP_PROP_FOURCC)) class BoundBox: def __init__(self, x, y, w, h, c = None, classes = None): self.x = x self.y = y self.w = w self.h = h self.c = c self.classes = classes def get_label(self): return np.argmax(self.classes) def get_score(self): return self.classes[self.get_label()] def iou(self, bound_box): b1 = self.as_centroid() b2 = bound_box.as_centroid() return centroid_box_iou(b1, b2) def as_centroid(self): return np.array([self.x, self.y, self.w, self.h]) def boxes_to_array(bound_boxes): """ # Args boxes : list of BoundBox instances # Returns centroid_boxes : (N, 4) probs : (N, nb_classes) """ temp_list = [] for box 
in bound_boxes: temp_list.append([np.argmax(box.classes), np.asarray([box.x, box.y, box.w, box.h]), np.max(box.classes)]) return np.array(temp_list) def nms_boxes(boxes, n_classes, nms_threshold=0.3, obj_threshold=0.3): """ # Args boxes : list of BoundBox # Returns boxes : list of BoundBox non maximum supressed BoundBox instances """ # suppress non-maximal boxes for c in range(n_classes): sorted_indices = list(reversed(np.argsort([box.classes[c] for box in boxes]))) for i in range(len(sorted_indices)): index_i = sorted_indices[i] if boxes[index_i].classes[c] == 0: continue else: for j in range(i+1, len(sorted_indices)): index_j = sorted_indices[j] if boxes[index_i].iou(boxes[index_j]) >= nms_threshold: boxes[index_j].classes[c] = 0 # remove the boxes which are less likely than a obj_threshold boxes = [box for box in boxes if box.get_score() > obj_threshold] return boxes def centroid_box_iou(box1, box2): def _interval_overlap(interval_a, interval_b): x1, x2 = interval_a x3, x4 = interval_b if x3 < x1: if x4 < x1: return 0 else: return min(x2,x4) - x1 else: if x2 < x3: return 0 else: return min(x2,x4) - x3 _, _, w1, h1 = box1.reshape(-1,) _, _, w2, h2 = box2.reshape(-1,) x1_min, y1_min, x1_max, y1_max = to_minmax(box1.reshape(-1,4)).reshape(-1,) x2_min, y2_min, x2_max, y2_max = to_minmax(box2.reshape(-1,4)).reshape(-1,) intersect_w = _interval_overlap([x1_min, x1_max], [x2_min, x2_max]) intersect_h = _interval_overlap([y1_min, y1_max], [y2_min, y2_max]) intersect = intersect_w * intersect_h union = w1 * h1 + w2 * h2 - intersect return float(intersect) / union def to_minmax(centroid_boxes): centroid_boxes = centroid_boxes.astype(np.float) minmax_boxes = np.zeros_like(centroid_boxes) cx = centroid_boxes[:,0] cy = centroid_boxes[:,1] w = centroid_boxes[:,2] h = centroid_boxes[:,3] minmax_boxes[:,0] = cx - w/2 minmax_boxes[:,1] = cy - h/2 minmax_boxes[:,2] = cx + w/2 minmax_boxes[:,3] = cy + h/2 return minmax_boxes ================================================ FILE: example_scripts/tensorflow_lite/classifier/templates/index.html ================================================ Video Streaming Demonstration

Tflite Image Classification Demo

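The three classifier scripts above share one pipeline: load the .tflite model, scale a BGR frame to [-1, 1] RGB as preprocess() does, invoke the interpreter, and map the top-k output indices to label strings. Below is a minimal single-image sketch of that pipeline, assuming hypothetical file names 'classifier.tflite', 'labels.txt' (comma-separated labels) and 'test.jpg' that are not shipped with the repository:

import cv2
import numpy as np
from tflite_runtime.interpreter import Interpreter

interpreter = Interpreter('classifier.tflite', num_threads=3)  # placeholder model path
interpreter.allocate_tensors()
_, height, width, _ = interpreter.get_input_details()[0]['shape']

img = cv2.imread('test.jpg')                        # placeholder image
img = cv2.resize(img, (width, height)).astype(np.float32)
img = (img / 255. - 0.5) * 2.                       # same scaling as preprocess()
img = np.expand_dims(img[:, :, ::-1], 0)            # BGR -> RGB, add batch axis

interpreter.set_tensor(interpreter.get_input_details()[0]['index'], img)
interpreter.invoke()
scores = np.squeeze(interpreter.get_tensor(interpreter.get_output_details()[0]['index']))

with open('labels.txt') as f:                       # placeholder labels file
    labels = [l.strip() for l in f.read().replace('"', '').split(',')]
for i in np.argsort(scores)[-3:][::-1]:             # top-3, highest score first
    print(labels[i], scores[i])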
================================================ FILE: example_scripts/tensorflow_lite/detector/base_camera.py ================================================ import time import threading try: from greenlet import getcurrent as get_ident except ImportError: try: from thread import get_ident except ImportError: from _thread import get_ident class CameraEvent(object): """An Event-like class that signals all active clients when a new frame is available. """ def __init__(self): self.events = {} def wait(self): """Invoked from each client's thread to wait for the next frame.""" ident = get_ident() if ident not in self.events: # this is a new client # add an entry for it in the self.events dict # each entry has two elements, a threading.Event() and a timestamp self.events[ident] = [threading.Event(), time.time()] return self.events[ident][0].wait() def set(self): """Invoked by the camera thread when a new frame is available.""" now = time.time() remove = None for ident, event in self.events.items(): if not event[0].isSet(): # if this client's event is not set, then set it # also update the last set timestamp to now event[0].set() event[1] = now else: # if the client's event is already set, it means the client # did not process a previous frame # if the event stays set for more than 5 seconds, then assume # the client is gone and remove it if now - event[1] > 5: remove = ident if remove: del self.events[remove] def clear(self): """Invoked from each client's thread after a frame was processed.""" self.events[get_ident()][0].clear() class BaseCamera(object): thread = None # background thread that reads frames from camera frame = None # current frame is stored here by background thread last_access = 0 # time of last client access to the camera event = CameraEvent() def __init__(self): """Start the background camera thread if it isn't running yet.""" if BaseCamera.thread is None: BaseCamera.last_access = time.time() # start background frame thread BaseCamera.thread = threading.Thread(target=self._thread) BaseCamera.thread.start() # wait until frames are available while self.get_frame() is None: time.sleep(0) def get_frame(self): """Return the current camera frame.""" BaseCamera.last_access = time.time() # wait for a signal from the camera thread BaseCamera.event.wait() BaseCamera.event.clear() return BaseCamera.frame @staticmethod def frames(): """"Generator that returns frames from the camera.""" raise RuntimeError('Must be implemented by subclasses.') @classmethod def _thread(cls): """Camera background thread.""" print('Starting camera thread.') frames_iterator = cls.frames() for frame in frames_iterator: BaseCamera.frame = frame BaseCamera.event.set() # send signal to clients time.sleep(0) # if there hasn't been any clients asking for frames in # the last 10 seconds then stop the thread if time.time() - BaseCamera.last_access > 10: frames_iterator.close() print('Stopping camera thread due to inactivity.') break BaseCamera.thread = None ================================================ FILE: example_scripts/tensorflow_lite/detector/camera_opencv.py ================================================ import cv2 from base_camera import BaseCamera class Camera(BaseCamera): video_source = 0 @staticmethod def set_video_source(source): Camera.video_source = source @staticmethod def frames(): camera = cv2.VideoCapture(Camera.video_source) if not camera.isOpened(): raise RuntimeError('Could not start camera.') while True: # read current frame _, img = camera.read() #img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 
# return img yield img ================================================ FILE: example_scripts/tensorflow_lite/detector/camera_pi.py ================================================ import io import time import picamera import picamera.array import cv2 from base_camera import BaseCamera class Camera(BaseCamera): video_source = 0 @staticmethod def set_video_source(source): pass @staticmethod def frames(): with picamera.PiCamera(resolution = (1280,720)) as camera: # let camera warm up time.sleep(2) with picamera.array.PiRGBArray(camera, size=(1280,720)) as stream: while True: camera.capture(stream, format='bgr', use_video_port=True) # At this point the image is available as stream.array image = stream.array stream.truncate(0) yield image ================================================ FILE: example_scripts/tensorflow_lite/detector/cv_utils.py ================================================ # Copyright © 2020 Arm Ltd and Contributors. All rights reserved. # SPDX-License-Identifier: MIT """ This file contains helper functions for reading video/image data and pre/postprocessing of video/image data using OpenCV. """ import os import cv2 import numpy as np def preprocess(img): img = img.astype(np.float32) img = img / 255. img = img - 0.5 img = img * 2. img = img[:, :, ::-1] img = np.expand_dims(img, 0) return img def decode_yolov2(netout, nms_threshold = 0.2, threshold = 0.3, anchors = [1.889, 2.5245, 2.9465, 3.94056, 3.99987, 5.3658, 5.155437, 6.92275, 6.718375, 9.01025]): #Convert Yolo network output to bounding box netout = netout[0].reshape(7,7,5,6) grid_h, grid_w, nb_box = netout.shape[:3] boxes = [] # decode the output by the network netout[..., 4] = _sigmoid(netout[..., 4]) netout[..., 5:] = netout[..., 4][..., np.newaxis] * _softmax(netout[..., 5:]) netout[..., 5:] *= netout[..., 5:] > threshold for row in range(grid_h): for col in range(grid_w): for b in range(nb_box): # from 4th element onwards are confidence and class classes classes = netout[row,col,b,5:] if np.sum(classes) > 0: # first 4 elements are x, y, w, and h x, y, w, h = netout[row,col,b,:4] x = (col + _sigmoid(x)) / grid_w # center position, unit: image width y = (row + _sigmoid(y)) / grid_h # center position, unit: image height w = anchors[2 * b + 0] * np.exp(w) / grid_w # unit: image width h = anchors[2 * b + 1] * np.exp(h) / grid_h # unit: image height confidence = netout[row,col,b,4] box = BoundBox(x, y, w, h, confidence, classes) boxes.append(box) boxes = nms_boxes(boxes, len(classes), nms_threshold, threshold) if len(boxes) > 0: return boxes_to_array(boxes) else: return [] def decode_yolov3(netout, nms_threshold = 0.2, threshold = 0.3, anchors = [[[0.76120044, 0.57155991], [0.6923348, 0.88535553], [0.47163042, 0.34163313]], [[0.33340788, 0.70065861], [0.18124964, 0.38986752], [0.08497349, 0.1527057 ]]]): #Convert Yolo network output to bounding box boxes = [] for l, output in enumerate(netout): grid_h, grid_w, nb_box = output.shape[0:3] # decode the output by the network output[..., 4] = _sigmoid(output[..., 4]) output[..., 5:] = output[..., 4][..., np.newaxis] * _sigmoid(output[..., 5:]) output[..., 5:] *= output[..., 5:] > threshold for row in range(grid_h): for col in range(grid_w): for b in range(nb_box): # from 4th element onwards are confidence and class classes classes = output[row, col, b, 5:] if np.sum(classes) > 0: # first 4 elements are x, y, w, and h x, y, w, h = output[row, col, b, :4] x = (col + _sigmoid(x)) / grid_w # center position, unit: image width y = (row + _sigmoid(y)) / grid_h # center position, 
unit: image height w = anchors[l][b][0] * np.exp(w) # unit: image width h = anchors[l][b][1] * np.exp(h) # unit: image height confidence = output[row, col, b, 4] box = BoundBox(x, y, w, h, confidence, classes) boxes.append(box) boxes = nms_boxes(boxes, len(classes), nms_threshold, threshold) if len(boxes) > 0: return boxes_to_array(boxes) else: return [] def decode_classifier(netout, top_k=3): netout = netout[0] ordered = np.argsort(netout) results = [(i, netout[i]) for i in ordered[-top_k:][::-1]] return results def decode_segnet(netout, labels, class_colors): netout = netout[0] seg_arr = netout.argmax(axis=2) seg_img = np.zeros((netout.shape[0], netout.shape[1], 3)) for c in range(len(labels)): seg_img[:, :, 0] += ((seg_arr[:, :] == c)*(class_colors[c][0])).astype('uint8') seg_img[:, :, 1] += ((seg_arr[:, :] == c)*(class_colors[c][1])).astype('uint8') seg_img[:, :, 2] += ((seg_arr[:, :] == c)*(class_colors[c][2])).astype('uint8') return seg_img def get_legends(class_names, colors): n_classes = len(class_names) legend = np.zeros(((len(class_names) * 25), 150, 3), dtype="uint8") + 255 for (i, (class_name, color)) in enumerate(zip(class_names.values() , colors)): color = [int(c) for c in color] cv2.putText(legend, class_name, (5, (i * 25) + 17),cv2.FONT_HERSHEY_COMPLEX, 0.5, (0, 0, 0), 1) cv2.rectangle(legend, (125, (i * 25)), (150, (i * 25) + 25), tuple(color), -1) return legend def overlay_seg_image(inp_img, seg_img): orininal_h = inp_img.shape[0] orininal_w = inp_img.shape[1] seg_img = cv2.resize(seg_img, (orininal_w, orininal_h)) fused_img = (inp_img/2 + seg_img/2 ).astype('uint8') return fused_img def concat_lenends(seg_img, legend_img): seg_img[:legend_img.shape[0],:legend_img.shape[1]] = np.copy(legend_img) return seg_img def _sigmoid(x): return 1. / (1. + np.exp(-x)) def _softmax(x, axis=-1, t=-100.): x = x - np.max(x) if np.min(x) < t: x = x/np.min(x)*t e_x = np.exp(x) return e_x / e_x.sum(axis, keepdims=True) def resize_with_aspect_ratio(frame: np.ndarray, input_binding_info: tuple): """ Resizes frame while maintaining aspect ratio, padding any empty space. Args: frame: Captured frame. input_binding_info: Contains shape of model input layer. Returns: Frame resized to the size of model input layer. """ aspect_ratio = frame.shape[1] / frame.shape[0] model_height, model_width = list(input_binding_info[1].GetShape())[1:3] if aspect_ratio >= 1.0: new_height, new_width = int(model_width / aspect_ratio), model_width b_padding, r_padding = model_height - new_height, 0 else: new_height, new_width = model_height, int(model_height * aspect_ratio) b_padding, r_padding = 0, model_width - new_width # Resize and pad any empty space frame = cv2.resize(frame, (new_width, new_height), interpolation=cv2.INTER_LINEAR) frame = cv2.copyMakeBorder(frame, top=0, bottom=b_padding, left=0, right=r_padding, borderType=cv2.BORDER_CONSTANT, value=[0, 0, 0]) return frame def create_video_writer(video, video_path, output_name): """ Creates a video writer object to write processed frames to file. Args: video: Video capture object, contains information about data source. video_path: User-specified video file path. output_path: Optional path to save the processed video. Returns: Video writer object. 
""" _, ext = os.path.splitext(video_path) i, filename = 0, output_name + ext while os.path.exists(filename): i += 1 filename = output_name + str(i) + ext video_writer = cv2.VideoWriter(filename=filename, fourcc=get_source_encoding_int(video), fps=int(video.get(cv2.CAP_PROP_FPS)), frameSize=(int(video.get(cv2.CAP_PROP_FRAME_WIDTH)), int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)))) return video_writer def init_video_file_capture(video_path, output_name): """ Creates a video capture object from a video file. Args: video_path: User-specified video file path. output_path: Optional path to save the processed video. Returns: Video capture object to capture frames, video writer object to write processed frames to file, plus total frame count of video source to iterate through. """ if not os.path.exists(video_path): raise FileNotFoundError(f'Video file not found for: {video_path}') video = cv2.VideoCapture(video_path) if not video.isOpened: raise RuntimeError(f'Failed to open video capture from file: {video_path}') video_writer = create_video_writer(video, video_path, output_name) iter_frame_count = range(int(video.get(cv2.CAP_PROP_FRAME_COUNT))) return video, video_writer, iter_frame_count def draw_bounding_boxes(frame, detections, labels=None, processing_function=None): """ Draws bounding boxes around detected objects and adds a label and confidence score. Args: frame: The original captured frame from video source. detections: A list of detected objects in the form [class, [box positions], confidence]. resize_factor: Resizing factor to scale box coordinates to output frame size. labels: Dictionary of labels and colors keyed on the classification index. """ def _to_original_scale(boxes, frame_height, frame_width): minmax_boxes = np.empty(shape=(4, ), dtype=np.int) cx = boxes[0] * frame_width cy = boxes[1] * frame_height w = boxes[2] * frame_width h = boxes[3] * frame_height minmax_boxes[0] = cx - w/2 minmax_boxes[1] = cy - h/2 minmax_boxes[2] = cx + w/2 minmax_boxes[3] = cy + h/2 return minmax_boxes color = (0, 255, 0) label_color = (125, 125, 125) for i in range(len(detections)): class_idx, box, confidence = [d for d in detections[i]] # Obtain frame size and resized bounding box positions frame_height, frame_width = frame.shape[:2] x_min, y_min, x_max, y_max = _to_original_scale(box, frame_height, frame_width) # Ensure box stays within the frame x_min, y_min = max(0, x_min), max(0, y_min) x_max, y_max = min(frame_width, x_max), min(frame_height, y_max) # Draw bounding box around detected object cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), color, 2) if processing_function: roi_img = frame[y_min:y_max, x_min:x_max] label = processing_function(roi_img) else: # Create label for detected object class label = labels[class_idx].capitalize() label = f'{label} {confidence * 100:.1f}%' # Make sure label always stays on-screen x_text, y_text = cv2.getTextSize(label, cv2.FONT_HERSHEY_DUPLEX, 1, 1)[0][:2] lbl_box_xy_min = (x_min, y_min if y_min<25 else y_min - y_text) lbl_box_xy_max = (x_min + int(0.55 * x_text), y_min + y_text if y_min<25 else y_min) lbl_text_pos = (x_min + 5, y_min + 16 if y_min<25 else y_min - 5) # Add label and confidence value cv2.rectangle(frame, lbl_box_xy_min, lbl_box_xy_max, color, -1) cv2.putText(frame, label, lbl_text_pos, cv2.FONT_HERSHEY_DUPLEX, 0.50, label_color, 1, cv2.LINE_AA) def draw_classification(frame, classifications, labels): for i in range(len(classifications)): label_id, prob = classifications[i] text = '%s : %.2f' % (labels[label_id], prob) cv2.putText(frame, 
text, (10, 20*i+20), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2, True) def get_source_encoding_int(video_capture): return int(video_capture.get(cv2.CAP_PROP_FOURCC)) class BoundBox: def __init__(self, x, y, w, h, c = None, classes = None): self.x = x self.y = y self.w = w self.h = h self.c = c self.classes = classes def get_label(self): return np.argmax(self.classes) def get_score(self): return self.classes[self.get_label()] def iou(self, bound_box): b1 = self.as_centroid() b2 = bound_box.as_centroid() return centroid_box_iou(b1, b2) def as_centroid(self): return np.array([self.x, self.y, self.w, self.h]) def boxes_to_array(bound_boxes): """ # Args boxes : list of BoundBox instances # Returns centroid_boxes : (N, 4) probs : (N, nb_classes) """ temp_list = [] for box in bound_boxes: temp_list.append([np.argmax(box.classes), np.asarray([box.x, box.y, box.w, box.h]), np.max(box.classes)]) return np.array(temp_list) def nms_boxes(boxes, n_classes, nms_threshold=0.3, obj_threshold=0.3): """ # Args boxes : list of BoundBox # Returns boxes : list of BoundBox non maximum supressed BoundBox instances """ # suppress non-maximal boxes for c in range(n_classes): sorted_indices = list(reversed(np.argsort([box.classes[c] for box in boxes]))) for i in range(len(sorted_indices)): index_i = sorted_indices[i] if boxes[index_i].classes[c] == 0: continue else: for j in range(i+1, len(sorted_indices)): index_j = sorted_indices[j] if boxes[index_i].iou(boxes[index_j]) >= nms_threshold: boxes[index_j].classes[c] = 0 # remove the boxes which are less likely than a obj_threshold boxes = [box for box in boxes if box.get_score() > obj_threshold] return boxes def centroid_box_iou(box1, box2): def _interval_overlap(interval_a, interval_b): x1, x2 = interval_a x3, x4 = interval_b if x3 < x1: if x4 < x1: return 0 else: return min(x2,x4) - x1 else: if x2 < x3: return 0 else: return min(x2,x4) - x3 _, _, w1, h1 = box1.reshape(-1,) _, _, w2, h2 = box2.reshape(-1,) x1_min, y1_min, x1_max, y1_max = to_minmax(box1.reshape(-1,4)).reshape(-1,) x2_min, y2_min, x2_max, y2_max = to_minmax(box2.reshape(-1,4)).reshape(-1,) intersect_w = _interval_overlap([x1_min, x1_max], [x2_min, x2_max]) intersect_h = _interval_overlap([y1_min, y1_max], [y2_min, y2_max]) intersect = intersect_w * intersect_h union = w1 * h1 + w2 * h2 - intersect return float(intersect) / union def to_minmax(centroid_boxes): centroid_boxes = centroid_boxes.astype(np.float) minmax_boxes = np.zeros_like(centroid_boxes) cx = centroid_boxes[:,0] cy = centroid_boxes[:,1] w = centroid_boxes[:,2] h = centroid_boxes[:,3] minmax_boxes[:,0] = cx - w/2 minmax_boxes[:,1] = cy - h/2 minmax_boxes[:,2] = cx + w/2 minmax_boxes[:,3] = cy + h/2 return minmax_boxes ================================================ FILE: example_scripts/tensorflow_lite/detector/detector_file.py ================================================ import time import argparse import os import cv2 import numpy as np from tqdm import tqdm from cv_utils import init_video_file_capture, decode_yolov3, draw_bounding_boxes, preprocess from tflite_runtime.interpreter import Interpreter def load_labels(path): with open(path, 'r') as f: return {i: line.strip() for i, line in enumerate(f.read().replace('"','').split(','))} class NetworkExecutor(object): def __init__(self, model_file): self.interpreter = Interpreter(model_file, num_threads=3) self.interpreter.allocate_tensors() _, self.input_height, self.input_width, _ = self.interpreter.get_input_details()[0]['shape'] self.tensor_index = 
self.interpreter.get_input_details()[0]['index'] def get_output_tensors(self): output_details = self.interpreter.get_output_details() tensor_list = [] for output in output_details: tensor = np.squeeze(self.interpreter.get_tensor(output['index'])) tensor_list.append(tensor) return tensor_list def run(self, image): if image.shape[:2] != (self.input_height, self.input_width): image = cv2.resize(image, (self.input_width, self.input_height)) img = preprocess(image) self.interpreter.set_tensor(self.tensor_index, img) self.interpreter.invoke() return self.get_output_tensors() def main(args, detector): video, video_writer, frame_count = init_video_file_capture(args.file, 'detector_demo') if not os.path.exists(args.labels[0]): labels = args.labels else: labels = load_labels(args.labels[0]) frame_num = len(frame_count) times = [] for _ in tqdm(frame_count, desc='Processing frames'): frame_present, frame = video.read() if not frame_present: continue start_time = time.time() results = detector.run(frame) elapsed_ms = (time.time() - start_time) * 1000 detections = decode_yolov3(netout = results, threshold = args.threshold) draw_bounding_boxes(frame, detections, labels) times.append(elapsed_ms) video_writer.write(frame) print('Finished processing frames') video.release(), video_writer.release() print("Average time(ms): ", sum(times)//frame_num) print("FPS: ", 1000.0 / (sum(times)//frame_num)) # FPS = 1 / time to process loop if __name__ == "__main__" : print("OpenCV version: {}".format(cv2. __version__)) parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--model', help='File path of .tflite file.', required=True) parser.add_argument('--labels', nargs="+", help='File path of labels file.', required=True) parser.add_argument('--threshold', type=float, help='Confidence threshold.', default=0.7) parser.add_argument('--file', help='File path of video file', default=None) args = parser.parse_args() detection_network = NetworkExecutor(args.model) main(args, detection_network) ================================================ FILE: example_scripts/tensorflow_lite/detector/detector_stream.py ================================================ import time import argparse import os import cv2 import numpy as np from cv_utils import decode_yolov3, preprocess, draw_bounding_boxes from tflite_runtime.interpreter import Interpreter from flask import Flask, render_template, request, Response app = Flask (__name__, static_url_path = '') def load_labels(path): with open(path, 'r') as f: return {i: line.strip() for i, line in enumerate(f.read().replace('"','').split(','))} class NetworkExecutor(object): def __init__(self, model_file): self.interpreter = Interpreter(model_file, num_threads=3) self.interpreter.allocate_tensors() _, self.input_height, self.input_width, _ = self.interpreter.get_input_details()[0]['shape'] self.tensor_index = self.interpreter.get_input_details()[0]['index'] def get_output_tensors(self): output_details = self.interpreter.get_output_details() tensor_list = [] for output in output_details: tensor = np.squeeze(self.interpreter.get_tensor(output['index'])) tensor_list.append(tensor) return tensor_list def run(self, image): if image.shape[:2] != (self.input_height, self.input_width): image = cv2.resize(image, (self.input_width, self.input_height)) img = preprocess(image) self.interpreter.set_tensor(self.tensor_index, img) self.interpreter.invoke() return self.get_output_tensors() class Detector(NetworkExecutor):
def __init__(self, label_file, model_file, threshold): super().__init__(model_file) self._threshold = float(threshold) if not os.path.exists(label_file): self.labels = [label_file] else: self.labels = load_labels(label_file) def detect(self, original_image): start_time = time.time() results = self.run(original_image) elapsed_ms = (time.time() - start_time) * 1000 detections = decode_yolov3(netout = results, threshold = self._threshold) draw_bounding_boxes(original_image, detections, self.labels) fps = 1 / elapsed_ms*1000 print("Estimated frames per second : {0:.2f} Inference time: {1:.2f}".format(fps, elapsed_ms)) return cv2.imencode('.jpg', original_image)[1].tobytes() @app.route("/") def index(): return render_template('index.html', name = None) def gen(camera): while True: frame = camera.get_frame() image = detector.detect(frame) yield (b'--frame\r\n'+b'Content-Type: image/jpeg\r\n\r\n' + image + b'\r\n') @app.route('/video_feed') def video_feed(): return Response(gen(Camera()), mimetype='multipart/x-mixed-replace; boundary=frame') parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--model', help='File path of .tflite file.', required=True) parser.add_argument('--labels', help='File path of labels file.', required=True) parser.add_argument('--threshold', help='Confidence threshold.', default=0.7) parser.add_argument('--source', help='picamera or cv', default='cv') args = parser.parse_args() if args.source == "cv": from camera_opencv import Camera source = 0 elif args.source == "picamera": from camera_pi import Camera source = 0 Camera.set_video_source(source) detector = Detector(args.labels, args.model, args.threshold) if __name__ == "__main__" : app.run(host = '0.0.0.0', port = 5000, debug = True) ================================================ FILE: example_scripts/tensorflow_lite/detector/templates/index.html ================================================ Video Streaming Demonstration

Tflite Object Detection Demo

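Both YOLO decoders above map each raw grid-cell prediction (tx, ty, tw, th) to a normalized box: the center is the cell offset plus a sigmoid, the size is the anchor scaled by an exponential. A small sketch of that arithmetic for a single YOLOv2 cell, using made-up prediction values rather than real model output:

import numpy as np

def sigmoid(x):
    return 1. / (1. + np.exp(-x))

grid_w = grid_h = 7                               # 7x7 output grid, as in decode_yolov2
anchors = [1.889, 2.5245, 2.9465, 3.94056]        # (w, h) pairs per anchor, in grid units
tx, ty, tw, th = 0.2, -0.1, 0.3, 0.5              # made-up raw predictions
row, col, b = 3, 4, 1                             # cell (3, 4), anchor index 1

x = (col + sigmoid(tx)) / grid_w                  # box center, fraction of image width
y = (row + sigmoid(ty)) / grid_h                  # box center, fraction of image height
w = anchors[2 * b] * np.exp(tw) / grid_w          # box width, fraction of image width
h = anchors[2 * b + 1] * np.exp(th) / grid_h      # box height, fraction of image height
print(x, y, w, h)  # scale by the frame size to get pixels, as draw_bounding_boxes does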
================================================ FILE: example_scripts/tensorflow_lite/segnet/base_camera.py ================================================ import time import threading try: from greenlet import getcurrent as get_ident except ImportError: try: from thread import get_ident except ImportError: from _thread import get_ident class CameraEvent(object): """An Event-like class that signals all active clients when a new frame is available. """ def __init__(self): self.events = {} def wait(self): """Invoked from each client's thread to wait for the next frame.""" ident = get_ident() if ident not in self.events: # this is a new client # add an entry for it in the self.events dict # each entry has two elements, a threading.Event() and a timestamp self.events[ident] = [threading.Event(), time.time()] return self.events[ident][0].wait() def set(self): """Invoked by the camera thread when a new frame is available.""" now = time.time() remove = None for ident, event in self.events.items(): if not event[0].isSet(): # if this client's event is not set, then set it # also update the last set timestamp to now event[0].set() event[1] = now else: # if the client's event is already set, it means the client # did not process a previous frame # if the event stays set for more than 5 seconds, then assume # the client is gone and remove it if now - event[1] > 5: remove = ident if remove: del self.events[remove] def clear(self): """Invoked from each client's thread after a frame was processed.""" self.events[get_ident()][0].clear() class BaseCamera(object): thread = None # background thread that reads frames from camera frame = None # current frame is stored here by background thread last_access = 0 # time of last client access to the camera event = CameraEvent() def __init__(self): """Start the background camera thread if it isn't running yet.""" if BaseCamera.thread is None: BaseCamera.last_access = time.time() # start background frame thread BaseCamera.thread = threading.Thread(target=self._thread) BaseCamera.thread.start() # wait until frames are available while self.get_frame() is None: time.sleep(0) def get_frame(self): """Return the current camera frame.""" BaseCamera.last_access = time.time() # wait for a signal from the camera thread BaseCamera.event.wait() BaseCamera.event.clear() return BaseCamera.frame @staticmethod def frames(): """"Generator that returns frames from the camera.""" raise RuntimeError('Must be implemented by subclasses.') @classmethod def _thread(cls): """Camera background thread.""" print('Starting camera thread.') frames_iterator = cls.frames() for frame in frames_iterator: BaseCamera.frame = frame BaseCamera.event.set() # send signal to clients time.sleep(0) # if there hasn't been any clients asking for frames in # the last 10 seconds then stop the thread if time.time() - BaseCamera.last_access > 10: frames_iterator.close() print('Stopping camera thread due to inactivity.') break BaseCamera.thread = None ================================================ FILE: example_scripts/tensorflow_lite/segnet/camera_opencv.py ================================================ import cv2 from base_camera import BaseCamera class Camera(BaseCamera): video_source = 0 @staticmethod def set_video_source(source): Camera.video_source = source @staticmethod def frames(): camera = cv2.VideoCapture(Camera.video_source) if not camera.isOpened(): raise RuntimeError('Could not start camera.') while True: # read current frame _, img = camera.read() #img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # 
return img yield img ================================================ FILE: example_scripts/tensorflow_lite/segnet/camera_pi.py ================================================ import io import time import picamera import picamera.array import cv2 from base_camera import BaseCamera class Camera(BaseCamera): @staticmethod def frames(): with picamera.PiCamera(resolution = (1280,720)) as camera: # let camera warm up time.sleep(2) with picamera.array.PiRGBArray(camera, size=(1280,720)) as stream: while True: camera.capture(stream, format='bgr') # At this point the image is available as stream.array image = stream.array stream.truncate(0) yield image ================================================ FILE: example_scripts/tensorflow_lite/segnet/cv_utils.py ================================================ # Copyright © 2020 Arm Ltd and Contributors. All rights reserved. # SPDX-License-Identifier: MIT """ This file contains helper functions for reading video/image data and pre/postprocessing of video/image data using OpenCV. """ import os import cv2 import numpy as np def preprocess(img): img = img.astype(np.float32) img = img / 255. img = img - 0.5 img = img * 2. img = img[:, :, ::-1] img = np.expand_dims(img, 0) return img def decode_yolov2(netout, nms_threshold = 0.2, threshold = 0.3, anchors = [1.889, 2.5245, 2.9465, 3.94056, 3.99987, 5.3658, 5.155437, 6.92275, 6.718375, 9.01025]): #Convert Yolo network output to bounding box netout = netout[0].reshape(7,7,5,6) grid_h, grid_w, nb_box = netout.shape[:3] boxes = [] # decode the output by the network netout[..., 4] = _sigmoid(netout[..., 4]) netout[..., 5:] = netout[..., 4][..., np.newaxis] * _softmax(netout[..., 5:]) netout[..., 5:] *= netout[..., 5:] > threshold for row in range(grid_h): for col in range(grid_w): for b in range(nb_box): # from 4th element onwards are confidence and class classes classes = netout[row,col,b,5:] if np.sum(classes) > 0: # first 4 elements are x, y, w, and h x, y, w, h = netout[row,col,b,:4] x = (col + _sigmoid(x)) / grid_w # center position, unit: image width y = (row + _sigmoid(y)) / grid_h # center position, unit: image height w = anchors[2 * b + 0] * np.exp(w) / grid_w # unit: image width h = anchors[2 * b + 1] * np.exp(h) / grid_h # unit: image height confidence = netout[row,col,b,4] box = BoundBox(x, y, w, h, confidence, classes) boxes.append(box) boxes = nms_boxes(boxes, len(classes), nms_threshold, threshold) if len(boxes) > 0: return boxes_to_array(boxes) else: return [] def decode_yolov3(netout, nms_threshold = 0.2, threshold = 0.3, anchors = [[[0.76120044, 0.57155991], [0.6923348, 0.88535553], [0.47163042, 0.34163313]], [[0.33340788, 0.70065861], [0.18124964, 0.38986752], [0.08497349, 0.1527057 ]]]): #Convert Yolo network output to bounding box boxes = [] for l, output in enumerate(netout): grid_h, grid_w, nb_box = output.shape[0:3] # decode the output by the network output[..., 4] = _sigmoid(output[..., 4]) output[..., 5:] = output[..., 4][..., np.newaxis] * _sigmoid(output[..., 5:]) output[..., 5:] *= output[..., 5:] > threshold for row in range(grid_h): for col in range(grid_w): for b in range(nb_box): # from 4th element onwards are confidence and class classes classes = output[row, col, b, 5:] if np.sum(classes) > 0: # first 4 elements are x, y, w, and h x, y, w, h = output[row, col, b, :4] x = (col + _sigmoid(x)) / grid_w # center position, unit: image width y = (row + _sigmoid(y)) / grid_h # center position, unit: image height w = anchors[l][b][0] * np.exp(w) # unit: image width h = anchors[l][b][1] 
* np.exp(h) # unit: image height confidence = output[row, col, b, 4] box = BoundBox(x, y, w, h, confidence, classes) boxes.append(box) boxes = nms_boxes(boxes, len(classes), nms_threshold, threshold) if len(boxes) > 0: return boxes_to_array(boxes) else: return [] def decode_classifier(netout, top_k=3): netout = netout[0] ordered = np.argsort(netout) results = [(i, netout[i]) for i in ordered[-top_k:][::-1]] return results def decode_segnet(netout, labels, class_colors): netout = netout[0] seg_arr = netout.argmax(axis=2) seg_img = np.zeros((netout.shape[0], netout.shape[1], 3)) for c in range(len(labels)): seg_img[:, :, 0] += ((seg_arr[:, :] == c)*(class_colors[c][0])).astype('uint8') seg_img[:, :, 1] += ((seg_arr[:, :] == c)*(class_colors[c][1])).astype('uint8') seg_img[:, :, 2] += ((seg_arr[:, :] == c)*(class_colors[c][2])).astype('uint8') return seg_img def get_legends(class_names, colors): n_classes = len(class_names) legend = np.zeros(((len(class_names) * 25), 150, 3), dtype="uint8") + 255 for (i, (class_name, color)) in enumerate(zip(class_names.values() , colors)): color = [int(c) for c in color] cv2.putText(legend, class_name, (5, (i * 25) + 17),cv2.FONT_HERSHEY_COMPLEX, 0.5, (0, 0, 0), 1) cv2.rectangle(legend, (125, (i * 25)), (150, (i * 25) + 25), tuple(color), -1) return legend def overlay_seg_image(inp_img, seg_img): orininal_h = inp_img.shape[0] orininal_w = inp_img.shape[1] seg_img = cv2.resize(seg_img, (orininal_w, orininal_h)) fused_img = (inp_img/2 + seg_img/2 ).astype('uint8') return fused_img def concat_lenends(seg_img, legend_img): seg_img[:legend_img.shape[0],:legend_img.shape[1]] = np.copy(legend_img) return seg_img def _sigmoid(x): return 1. / (1. + np.exp(-x)) def _softmax(x, axis=-1, t=-100.): x = x - np.max(x) if np.min(x) < t: x = x/np.min(x)*t e_x = np.exp(x) return e_x / e_x.sum(axis, keepdims=True) def resize_with_aspect_ratio(frame: np.ndarray, input_binding_info: tuple): """ Resizes frame while maintaining aspect ratio, padding any empty space. Args: frame: Captured frame. input_binding_info: Contains shape of model input layer. Returns: Frame resized to the size of model input layer. """ aspect_ratio = frame.shape[1] / frame.shape[0] model_height, model_width = list(input_binding_info[1].GetShape())[1:3] if aspect_ratio >= 1.0: new_height, new_width = int(model_width / aspect_ratio), model_width b_padding, r_padding = model_height - new_height, 0 else: new_height, new_width = model_height, int(model_height * aspect_ratio) b_padding, r_padding = 0, model_width - new_width # Resize and pad any empty space frame = cv2.resize(frame, (new_width, new_height), interpolation=cv2.INTER_LINEAR) frame = cv2.copyMakeBorder(frame, top=0, bottom=b_padding, left=0, right=r_padding, borderType=cv2.BORDER_CONSTANT, value=[0, 0, 0]) return frame def create_video_writer(video, video_path, output_name): """ Creates a video writer object to write processed frames to file. Args: video: Video capture object, contains information about data source. video_path: User-specified video file path. output_path: Optional path to save the processed video. Returns: Video writer object. 
""" _, ext = os.path.splitext(video_path) i, filename = 0, output_name + ext while os.path.exists(filename): i += 1 filename = output_name + str(i) + ext video_writer = cv2.VideoWriter(filename=filename, fourcc=get_source_encoding_int(video), fps=int(video.get(cv2.CAP_PROP_FPS)), frameSize=(int(video.get(cv2.CAP_PROP_FRAME_WIDTH)), int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)))) return video_writer def init_video_file_capture(video_path, output_name): """ Creates a video capture object from a video file. Args: video_path: User-specified video file path. output_path: Optional path to save the processed video. Returns: Video capture object to capture frames, video writer object to write processed frames to file, plus total frame count of video source to iterate through. """ if not os.path.exists(video_path): raise FileNotFoundError(f'Video file not found for: {video_path}') video = cv2.VideoCapture(video_path) if not video.isOpened: raise RuntimeError(f'Failed to open video capture from file: {video_path}') video_writer = create_video_writer(video, video_path, output_name) iter_frame_count = range(int(video.get(cv2.CAP_PROP_FRAME_COUNT))) return video, video_writer, iter_frame_count def draw_bounding_boxes(frame, detections, labels=None, processing_function=None): """ Draws bounding boxes around detected objects and adds a label and confidence score. Args: frame: The original captured frame from video source. detections: A list of detected objects in the form [class, [box positions], confidence]. resize_factor: Resizing factor to scale box coordinates to output frame size. labels: Dictionary of labels and colors keyed on the classification index. """ def _to_original_scale(boxes, frame_height, frame_width): minmax_boxes = np.empty(shape=(4, ), dtype=np.int) cx = boxes[0] * frame_width cy = boxes[1] * frame_height w = boxes[2] * frame_width h = boxes[3] * frame_height minmax_boxes[0] = cx - w/2 minmax_boxes[1] = cy - h/2 minmax_boxes[2] = cx + w/2 minmax_boxes[3] = cy + h/2 return minmax_boxes color = (0, 255, 0) label_color = (125, 125, 125) for i in range(len(detections)): class_idx, box, confidence = [d for d in detections[i]] # Obtain frame size and resized bounding box positions frame_height, frame_width = frame.shape[:2] x_min, y_min, x_max, y_max = _to_original_scale(box, frame_height, frame_width) # Ensure box stays within the frame x_min, y_min = max(0, x_min), max(0, y_min) x_max, y_max = min(frame_width, x_max), min(frame_height, y_max) # Draw bounding box around detected object cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), color, 2) if processing_function: roi_img = frame[y_min:y_max, x_min:x_max] label = processing_function(roi_img) else: # Create label for detected object class label = labels[class_idx].capitalize() label = f'{label} {confidence * 100:.1f}%' # Make sure label always stays on-screen x_text, y_text = cv2.getTextSize(label, cv2.FONT_HERSHEY_DUPLEX, 1, 1)[0][:2] lbl_box_xy_min = (x_min, y_min if y_min<25 else y_min - y_text) lbl_box_xy_max = (x_min + int(0.55 * x_text), y_min + y_text if y_min<25 else y_min) lbl_text_pos = (x_min + 5, y_min + 16 if y_min<25 else y_min - 5) # Add label and confidence value cv2.rectangle(frame, lbl_box_xy_min, lbl_box_xy_max, color, -1) cv2.putText(frame, label, lbl_text_pos, cv2.FONT_HERSHEY_DUPLEX, 0.50, label_color, 1, cv2.LINE_AA) def draw_classification(frame, classifications, labels): for i in range(len(classifications)): label_id, prob = classifications[i] text = '%s : %.2f' % (labels[label_id], prob) cv2.putText(frame, 
text, (10, 20*i+20), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2, True) def get_source_encoding_int(video_capture): return int(video_capture.get(cv2.CAP_PROP_FOURCC)) class BoundBox: def __init__(self, x, y, w, h, c = None, classes = None): self.x = x self.y = y self.w = w self.h = h self.c = c self.classes = classes def get_label(self): return np.argmax(self.classes) def get_score(self): return self.classes[self.get_label()] def iou(self, bound_box): b1 = self.as_centroid() b2 = bound_box.as_centroid() return centroid_box_iou(b1, b2) def as_centroid(self): return np.array([self.x, self.y, self.w, self.h]) def boxes_to_array(bound_boxes): """ # Args boxes : list of BoundBox instances # Returns centroid_boxes : (N, 4) probs : (N, nb_classes) """ temp_list = [] for box in bound_boxes: temp_list.append([np.argmax(box.classes), np.asarray([box.x, box.y, box.w, box.h]), np.max(box.classes)]) return np.array(temp_list) def nms_boxes(boxes, n_classes, nms_threshold=0.3, obj_threshold=0.3): """ # Args boxes : list of BoundBox # Returns boxes : list of BoundBox non maximum supressed BoundBox instances """ # suppress non-maximal boxes for c in range(n_classes): sorted_indices = list(reversed(np.argsort([box.classes[c] for box in boxes]))) for i in range(len(sorted_indices)): index_i = sorted_indices[i] if boxes[index_i].classes[c] == 0: continue else: for j in range(i+1, len(sorted_indices)): index_j = sorted_indices[j] if boxes[index_i].iou(boxes[index_j]) >= nms_threshold: boxes[index_j].classes[c] = 0 # remove the boxes which are less likely than a obj_threshold boxes = [box for box in boxes if box.get_score() > obj_threshold] return boxes def centroid_box_iou(box1, box2): def _interval_overlap(interval_a, interval_b): x1, x2 = interval_a x3, x4 = interval_b if x3 < x1: if x4 < x1: return 0 else: return min(x2,x4) - x1 else: if x2 < x3: return 0 else: return min(x2,x4) - x3 _, _, w1, h1 = box1.reshape(-1,) _, _, w2, h2 = box2.reshape(-1,) x1_min, y1_min, x1_max, y1_max = to_minmax(box1.reshape(-1,4)).reshape(-1,) x2_min, y2_min, x2_max, y2_max = to_minmax(box2.reshape(-1,4)).reshape(-1,) intersect_w = _interval_overlap([x1_min, x1_max], [x2_min, x2_max]) intersect_h = _interval_overlap([y1_min, y1_max], [y2_min, y2_max]) intersect = intersect_w * intersect_h union = w1 * h1 + w2 * h2 - intersect return float(intersect) / union def to_minmax(centroid_boxes): centroid_boxes = centroid_boxes.astype(np.float) minmax_boxes = np.zeros_like(centroid_boxes) cx = centroid_boxes[:,0] cy = centroid_boxes[:,1] w = centroid_boxes[:,2] h = centroid_boxes[:,3] minmax_boxes[:,0] = cx - w/2 minmax_boxes[:,1] = cy - h/2 minmax_boxes[:,2] = cx + w/2 minmax_boxes[:,3] = cy + h/2 return minmax_boxes ================================================ FILE: example_scripts/tensorflow_lite/segnet/segnet_file.py ================================================ import time import argparse import os import cv2 import numpy as np from tqdm import tqdm import random random.seed(0) from cv_utils import init_video_file_capture, decode_segnet, get_legends, overlay_seg_image, concat_lenends, preprocess from tflite_runtime.interpreter import Interpreter def load_labels(path): with open(path, 'r') as f: return {i: line.strip() for i, line in enumerate(f.read().replace('"','').split(','))} class NetworkExecutor(object): def __init__(self, model_file): self.interpreter = Interpreter(model_file, num_threads=3) self.interpreter.allocate_tensors() _, self.input_height, self.input_width, _ = 
self.interpreter.get_input_details()[0]['shape'] self.tensor_index = self.interpreter.get_input_details()[0]['index'] def get_output_tensors(self): output_details = self.interpreter.get_output_details() tensor_list = [] for output in output_details: tensor = np.squeeze(self.interpreter.get_tensor(output['index'])) tensor_list.append(tensor) return tensor_list def run(self, image): if image.shape[:2] != (self.input_height, self.input_width): image = cv2.resize(image, (self.input_width, self.input_height)) img = preprocess(image) self.interpreter.set_tensor(self.tensor_index, img) self.interpreter.invoke() return self.get_output_tensors() def main(args): video, video_writer, frame_count = init_video_file_capture(args.file, 'segnet_demo') if not os.path.exists(args.labels[0]): labels = args.labels else: labels = load_labels(args.labels[0]) class_colors = [(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) for _ in range(256)] legend_img = get_legends(labels, class_colors) frame_num = len(frame_count) times = [] for _ in tqdm(frame_count, desc='Processing frames'): frame_present, frame = video.read() if not frame_present: continue start_time = time.time() results = segmentation_network.run(frame) elapsed_ms = (time.time() - start_time) * 1000 seg_img = decode_segnet(results, labels, class_colors) if args.overlay: seg_img = overlay_seg_image(frame, seg_img) frame = concat_lenends(seg_img, legend_img) times.append(elapsed_ms) video_writer.write(frame) print('Finished processing frames') video.release(), video_writer.release() print("Average time(ms): ", sum(times)//frame_num) print("FPS: ", 1000.0 / (sum(times)//frame_num)) # FPS = 1 / time to process loop if __name__ == "__main__" : print("OpenCV version: {}".format(cv2.
__version__)) parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--model', help='File path of .tflite file.', required=True) parser.add_argument('--labels', nargs="+", help='File path of labels file.', required=True) parser.add_argument('--overlay', type=lambda s: str(s).lower() in ('1', 'true', 'yes'), help='Overlay original image.', default=True) parser.add_argument('--file', help='File path of video file', default=None) args = parser.parse_args() segmentation_network = NetworkExecutor(args.model) main(args) ================================================ FILE: example_scripts/tensorflow_lite/segnet/segnet_stream.py ================================================ import time import argparse import os import cv2 import numpy as np import random random.seed(0) from cv_utils import decode_segnet, get_legends, overlay_seg_image, concat_lenends, preprocess from tflite_runtime.interpreter import Interpreter from flask import Flask, render_template, request, Response app = Flask (__name__, static_url_path = '') def load_labels(path): with open(path, 'r') as f: return {i: line.strip() for i, line in enumerate(f.read().replace('"','').split(','))} class NetworkExecutor(object): def __init__(self, model_file): self.interpreter = Interpreter(model_file, num_threads=3) self.interpreter.allocate_tensors() _, self.input_height, self.input_width, _ = self.interpreter.get_input_details()[0]['shape'] self.tensor_index = self.interpreter.get_input_details()[0]['index'] def get_output_tensors(self): output_details = self.interpreter.get_output_details() tensor_list = [] for output in output_details: tensor = np.squeeze(self.interpreter.get_tensor(output['index'])) tensor_list.append(tensor) return tensor_list def run(self, image): if image.shape[:2] != (self.input_height, self.input_width): image = cv2.resize(image, (self.input_width, self.input_height)) img = preprocess(image) self.interpreter.set_tensor(self.tensor_index, img) self.interpreter.invoke() return self.get_output_tensors() class Segnet(NetworkExecutor): def __init__(self, label_file, model_file, overlay): super().__init__(model_file) if not os.path.exists(label_file): self.labels = [label_file] else: self.labels = load_labels(label_file) self.class_colors = [(random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)) for _ in range(256)] self.legend_img = get_legends(self.labels, self.class_colors) self.overlay = overlay def segment(self, frame): start_time = time.time() results = self.run(frame) elapsed_ms = (time.time() - start_time) * 1000 seg_img = decode_segnet(results, self.labels, self.class_colors) if self.overlay: seg_img = overlay_seg_image(frame, seg_img) frame = concat_lenends(seg_img, self.legend_img) fps = 1000.0 / elapsed_ms print("Estimated frames per second : {0:.2f} Inference time: {1:.2f}".format(fps, elapsed_ms)) return cv2.imencode('.jpg', frame)[1].tobytes() @app.route("/") def index(): return render_template('index.html', name = None) def gen(camera): while True: frame = camera.get_frame() image = segnet.segment(frame) yield (b'--frame\r\n'+b'Content-Type: image/jpeg\r\n\r\n' + image + b'\r\n') @app.route('/video_feed') def video_feed(): return Response(gen(Camera()), mimetype='multipart/x-mixed-replace; boundary=frame') parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--model', help='File path of .tflite file.', required=True) parser.add_argument('--labels', help='File path of labels file.', required=True)
parser.add_argument('--overlay', type=lambda s: str(s).lower() in ('1', 'true', 'yes'), help='Overlay original image.', default=True) parser.add_argument('--source', help='picamera or cv', default='cv') args = parser.parse_args() if args.source == "cv": from camera_opencv import Camera source = 0 elif args.source == "picamera": from camera_pi import Camera source = 0 Camera.set_video_source(source) segnet = Segnet(args.labels, args.model, args.overlay) if __name__ == "__main__" : app.run(host = '0.0.0.0', port = 5000, debug = True) ================================================ FILE: example_scripts/tensorflow_lite/segnet/templates/index.html ================================================ Video Streaming Demonstration

Tflite Semantic Segmentation Demo

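The segnet scripts reduce the (height, width, n_classes) network output to a per-pixel argmax and paint each class with its color, which is what decode_segnet implements. A toy sketch of that step on a hand-written 2x2, 3-class score map (no model involved):

import numpy as np

netout = np.array([[[0.9, 0.05, 0.05], [0.1, 0.8, 0.1]],
                   [[0.2, 0.2, 0.6], [0.7, 0.2, 0.1]]])  # (2, 2, 3) toy class scores
class_colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255)]   # one BGR color per class

seg_arr = netout.argmax(axis=2)                  # (2, 2) winning class per pixel
seg_img = np.zeros((*seg_arr.shape, 3), dtype=np.uint8)
for c, color in enumerate(class_colors):
    seg_img[seg_arr == c] = color                # paint all pixels of class c
print(seg_arr)        # [[0 1] [2 0]]
print(seg_img[0, 0])  # [255 0 0]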
================================================ FILE: resources/aXeleRate_face_detector.ipynb ================================================ { "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "aXeleRate_pascal20_detector.ipynb", "private_outputs": true, "provenance": [], "collapsed_sections": [], "mount_file_id": "1_yhmzOZKns_-h0GwyPu9YAT3K0WQ1PG8", "authorship_tag": "ABX9TyObcL241uRYx/322b9y47kr", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "hS9yMrWe02WQ" }, "source": [ "## PASCAL-VOC Detection model Training and Inference\n", "\n", "In this notebook we will use axelerate, Keras-based framework for AI on the edge, to quickly setup model training and then after training session is completed convert it to .tflite and .kmodel formats.\n", "\n", "First, let's take care of some administrative details. \n", "\n", "1) Before we do anything, make sure you have choosen GPU as Runtime type (in Runtime - > Change Runtime type).\n", "\n", "2) We need to mount Google Drive for saving our model checkpoints and final converted model(s). Press on Mount Google Drive button in Files tab on your left. \n", "\n", "In the next cell we clone axelerate Github repository and import it. \n", "\n", "**It is possible to use pip install or python setup.py install, but in that case you will need to restart the enironment.** Since I'm trying to make the process as streamlined as possibile I'm using sys.path.append for import." ] }, { "cell_type": "code", "metadata": { "id": "y07yAbYbjV2s" }, "source": [ "#we need imgaug 0.4 for image augmentations to work properly, see https://stackoverflow.com/questions/62580797/in-colab-doing-image-data-augmentation-with-imgaug-is-not-working-as-intended\n", "!pip uninstall -y imgaug && pip uninstall -y albumentations && pip install imgaug==0.4\n", "!git clone https://github.com/AIWintermuteAI/aXeleRate.git\n", "import sys\n", "sys.path.append('/content/aXeleRate')\n", "from axelerate import setup_training, setup_inference" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "5TBRMPZ83dRL" }, "source": [ "At this step you typically need to get the dataset. You can use !wget command to download it from somewhere on the Internet or !cp to copy from My Drive as in this example\n", "```\n", "!cp -r /content/drive/'My Drive'/pascal_20_segmentation.zip .\n", "!unzip --qq pascal_20_segmentation.zip\n", "```\n", "For this notebook we will use PASCAL-VOC 2012 object detection dataset, which you can download here:\n", "\n", "http://host.robots.ox.ac.uk:8080/pascal/VOC/voc2012/index.html#devkit\n", "\n", "I split the dataset into training and validation using a simple Python script. Since most of the models trained with aXeleRate are to be run on embedded devices and thus have memory and latency constraints, the validation images are easier than most of the images in training set. The validation images include one(or many) instance of a particular class, no mixed classes in one image.\n", "\n", "Let's visualize our detection model test dataset. We use img_num=10 to show only first 10 images. 
Feel free to change the number to None to see all 100 images.\n" ] }, { "cell_type": "code", "metadata": { "id": "_tpsgkGj7d79" }, "source": [ "%matplotlib inline\n", "!gdown https://drive.google.com/uc?id=1uQtP-Yct0Uiz7bU7cwl9hJU0AVGkMgGZ #subset of WideFace dataset\n", "\n", "!unzip --qq WideFace_large.zip\n", "\n", "from axelerate.networks.common_utils.augment import visualize_detection_dataset\n", "\n", "visualize_detection_dataset(img_folder='WideFace_large/imgs_validation', ann_folder='WideFace_large/anns_validation', num_imgs=10, img_size=224, augment=True)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "S1oqdtbr7VLB" }, "source": [ "Next step is defining a config dictionary. Most lines are self-explanatory.\n", "\n", "Type is model frontend - Classifier, Detector or Segnet\n", "\n", "Architecture is model backend (feature extractor) \n", "\n", "- Full Yolo\n", "- Tiny Yolo\n", "- MobileNet1_0\n", "- MobileNet7_5 \n", "- MobileNet5_0 \n", "- MobileNet2_5 \n", "- SqueezeNet\n", "- NASNetMobile\n", "- DenseNet121\n", "- ResNet50\n", "\n", "For more information on anchors, please read here\n", "https://github.com/pjreddie/darknet/issues/568\n", "\n", "Labels are labels present in your dataset.\n", "IMPORTANT: Please, list all the labels present in the dataset.\n", "\n", "object_scale determines how much to penalize wrong prediction of confidence of object predictors\n", "\n", "no_object_scale determines how much to penalize wrong prediction of confidence of non-object predictors\n", "\n", "coord_scale determines how much to penalize wrong position and size predictions (x, y, w, h)\n", "\n", "class_scale determines how much to penalize wrong class prediction\n", "\n", "For converter type you can choose the following:\n", "\n", "'k210', 'tflite_fullint', 'tflite_dynamic', 'edgetpu', 'openvino', 'onnx'" ] }, { "cell_type": "code", "metadata": { "id": "uruWpeGRf6Qi" }, "source": [ "config = {\n", " \"model\":{\n", " \"type\": \"Detector\",\n", " \"architecture\": \"MobileNet2_5\",\n", " \"input_size\": 224,\n", " \"anchors\": [0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828],\n", " \"labels\": [\"face\"],\n", " \"coord_scale\" : \t\t1.0,\n", " \"class_scale\" : \t\t1.0,\n", " \"object_scale\" : \t\t5.0,\n", " \"no_object_scale\" : \t1.0\n", " },\n", " \"weights\" : {\n", " \"full\": \t\t\t\t\"\",\n", " \"backend\": \t\t \"imagenet\"\n", " },\n", " \"train\" : {\n", " \"actual_epoch\": 30,\n", " \"train_image_folder\": \"WideFace_large/imgs\",\n", " \"train_annot_folder\": \"WideFace_large/anns\",\n", " \"train_times\": 1,\n", " \"valid_image_folder\": \"WideFace_large/imgs_validation\",\n", " \"valid_annot_folder\": \"WideFace_large/anns_validation\",\n", " \"valid_times\": 1,\n", " \"valid_metric\": \"mAP\",\n", " \"batch_size\": 32,\n", " \"learning_rate\": 1e-3,\n", " \"saved_folder\": \t\tF\"/content/drive/MyDrive/WideFace_large\",\n", " \"first_trainable_layer\": \"\",\n", " \"augumentation\":\t\t\t\tFalse,\n", " \"is_only_detect\" : \t\t False\n", " },\n", " \"converter\" : {\n", " \"type\": \t\t\t\t[\"tflite\"]\n", " }\n", " }" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "kobC_7gd5mEu" }, "source": [ "Let's check what GPU we have been assigned in this Colab session, if any." 
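A quick aside on the four scale parameters in the config above. Here is an illustrative sketch - my own simplification, not the actual loss code in axelerate/networks/yolo/backend/loss.py - of how they weight the components of the YOLO loss:

```python
# Illustrative sketch only: how coord_scale, object_scale, no_object_scale and
# class_scale from the config weight the per-term YOLO losses.
def combined_yolo_loss(coord_loss, obj_conf_loss, no_obj_conf_loss, class_loss,
                       coord_scale=1.0, object_scale=5.0,
                       no_object_scale=1.0, class_scale=1.0):
    return (coord_scale * coord_loss              # x, y, w, h errors
            + object_scale * obj_conf_loss        # confidence where objects exist
            + no_object_scale * no_obj_conf_loss  # confidence on empty cells
            + class_scale * class_loss)           # classification errors
```

With object_scale at 5.0 and no_object_scale at 1.0, as in the config above, missed detections are penalized five times harder than spurious ones.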
] }, { "cell_type": "code", "metadata": { "id": "rESho_T70BWq" }, "source": [ "from tensorflow.python.client import device_lib\n", "device_lib.list_local_devices()" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "i0Fc61WrTxh1" }, "source": [ "Also, let's open Tensorboard, where we will be able to watch model training progress in real time. Training and validation logs also will be saved in project folder.\n", "Since there are no logs before we start the training, tensorboard will be empty. Refresh it after first epoch." ] }, { "cell_type": "code", "metadata": { "id": "jsGp9JvjTzzp" }, "source": [ "%load_ext tensorboard\n", "%tensorboard --logdir logs\n", "!sleep 10" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "cWyKjw-b5_yp" }, "source": [ "Finally we start the training by passing config dictionary we have defined earlier to setup_training function. The function will start the training with Reduce Learning Rate on Plateau and save on best mAP callbacks. Every epoch mAP of the model predictions is measured on the validation dataset. If you have specified the converter type in the config, after the training has stopped the script will convert the best model into the format you have specified in config and save it to the project folder.\n", "\n", "Let's train for one epoch to see how the whole pipeline works." ] }, { "cell_type": "code", "metadata": { "id": "deYD3cwukHsj" }, "source": [ "from keras import backend as K \n", "K.clear_session()\n", "model_path = setup_training(config_dict=config)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "ypTe3GZI619O" }, "source": [ "After training it is good to check the actual perfomance of your model by doing inference on your validation dataset and visualizing results. This is exactly what next block does." ] }, { "cell_type": "code", "metadata": { "id": "jE7pTYmZN7Pi" }, "source": [ "%matplotlib inline\n", "from keras import backend as K \n", "K.clear_session()\n", "setup_inference(config, model_path)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "nKsxhdPvzrD8" }, "source": [ "If you need to convert trained model to other formats, for example for inference with Edge TPU or OpenCV AI Kit, you can do it with following commands. Specify the converter type, backend and folder with calbiration images(normally your validation image folder)." ] }, { "cell_type": "code", "metadata": { "id": "awR7r4ILzrmb" }, "source": [ "from axelerate.networks.common_utils.convert import Converter\n", "converter = Converter('openvino', 'MobileNet2_5', 'WideFace_large/imgs_validation')\n", "converter.convert_model(model_path)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "5YuVe2VD11cd" }, "source": [ "Good luck and happy training! 
Have a look at these articles, which will help you get the most out of Google Colab or connect to a local runtime if there are no GPUs available:\n", "\n", "https://medium.com/@oribarel/getting-the-most-out-of-your-google-colab-2b0585f82403\n", "\n", "https://research.google.com/colaboratory/local-runtimes.html" ] } ] } ================================================ FILE: resources/aXeleRate_human_segmentation.ipynb ================================================ { "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "aXeleRate_human_segmentation.ipynb", "private_outputs": true, "provenance": [], "collapsed_sections": [], "mount_file_id": "101-DJzi5oWG7njbiibTdxgmG67ku_62z", "authorship_tag": "ABX9TyMYA8L5Gv+PoKfxaPtba9us", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "hS9yMrWe02WQ" }, "source": [ "## Segmentation model Training and Inference\n", "\n", "In this notebook we will use aXeleRate, a Keras-based framework for AI on the edge, to quickly set up model training and then, after the training session is completed, convert the model to .tflite and .kmodel formats.\n", "\n", "First, let's take care of some administrative details. \n", "\n", "1) Before we do anything, make sure you have chosen GPU as Runtime type (in Runtime -> Change Runtime type).\n", "\n", "2) We need to mount Google Drive for saving our model checkpoints and final converted model(s). Press the Mount Google Drive button in the Files tab on your left. \n", "\n", "In the next cell we clone the aXeleRate GitHub repository and import it. \n", "\n", "**It is possible to use pip install or python setup.py install, but in that case you will need to restart the environment.** Since I'm trying to make the process as streamlined as possible, I'm using sys.path.append for the import." ] }, { "cell_type": "code", "metadata": { "id": "y07yAbYbjV2s" }, "source": [ "#we need imgaug 0.4 for image augmentations to work properly, see https://stackoverflow.com/questions/62580797/in-colab-doing-image-data-augmentation-with-imgaug-is-not-working-as-intended\n", "!pip uninstall -y imgaug && pip uninstall -y albumentations && pip install imgaug==0.4\n", "!git clone https://github.com/AIWintermuteAI/aXeleRate.git\n", "import sys\n", "sys.path.append('/content/aXeleRate')\n", "from axelerate import setup_training, setup_inference" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "5TBRMPZ83dRL" }, "source": [ "At this step you typically need to get the dataset. You can use the !wget command to download it from somewhere on the Internet, or !cp to copy it from My Drive as in this example\n", "```\n", "!cp -r /content/drive/'My Drive'/pascal_20_segmentation.zip .\n", "!unzip --qq pascal_20_segmentation.zip\n", "```\n", "For this notebook we'll download the dataset I shared on Google Drive - it is a combination of two datasets for human image segmentation:\n", "\n", "[Human Segmentation Dataset by Vikram Shenoy](https://github.com/VikramShenoy97/Human-Segmentation-Dataset)\n", "\n", "[Human Parsing Dataset](https://github.com/lemondan/HumanParsing-Dataset)\n", "\n", "For semantic segmentation the dataset consists of RGB images and segmentation masks.
\n", "A few things to keep in mind:\n", "\n", "- The filenames of the annotation images should be same as the filenames of the RGB images.\n", "\n", "- The dimensions of the annotation image for the corresponding RGB image should be same.\n", "\n", "- For each pixel in the RGB image, the class label of that pixel in the annotation image would be the value of the annotation image pixel.\n", "\n", "Let's visualize our semantic segmentation test dataset and see what that means in practice.\n" ] }, { "cell_type": "code", "metadata": { "id": "_tpsgkGj7d79" }, "source": [ "%matplotlib inline\n", "!gdown https://drive.google.com/uc?id=1NlKgS_GVusRhEFLqwm0EOP2i74z1JMHX\n", "!gdown https://drive.google.com/uc?id=18z2MLv9M6ARVE1KTHyoAqJQZOfSJWc57\n", "!unzip --qq human_segmentation.zip\n", "\n", "from axelerate.networks.common_utils.augment import visualize_segmentation_dataset\n", "\n", "visualize_segmentation_dataset(images_path = 'human_segmentation/imgs_validation', segs_path = 'human_segmentation/anns_validation', num_imgs = 10, img_size=224, augment=True, n_classes=2)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "S1oqdtbr7VLB" }, "source": [ "Next step is defining a config dictionary. Most lines are self-explanatory.\n", "\n", "Type is model frontend - Classifier, Detector or Segnet\n", "\n", "Architecture is model backend (feature extractor) \n", "\n", "- Full Yolo\n", "- Tiny Yolo\n", "- MobileNet1_0\n", "- MobileNet7_5 \n", "- MobileNet5_0 \n", "- MobileNet2_5 \n", "- SqueezeNet\n", "- NASNetMobile\n", "- ResNet50\n", "- DenseNet121\n", "\n", "For converter type you can choose the following:\n", "\n", "'k210', 'tflite_fullint', 'tflite_dynamic', 'edgetpu', 'openvino', 'onnx'\n", "\n", "**Since it is an example notebook, we will use pretrained weights and set all layers of the model to be \"frozen\"(non-trainable).** \n" ] }, { "cell_type": "code", "metadata": { "id": "Jw4q6_MsegD2" }, "source": [ "config = {\n", " \"model\" : {\n", " \"type\": \"SegNet\",\n", " \"architecture\": \"MobileNet5_0\",\n", " \"input_size\": 224,\n", " \"n_classes\" : \t\t2\n", " },\n", " \"weights\" : {\n", " \"full\": \t\t\t\t\"/content/Segnet_best_val_loss.h5\",\n", " \"backend\": \t\t \"imagenet\"\n", " },\n", " \"train\" : {\n", " \"actual_epoch\": 1,\n", " \"train_image_folder\": \"human_segmentation/imgs\",\n", " \"train_annot_folder\": \"human_segmentation/anns\",\n", " \"train_times\": 1,\n", " \"valid_image_folder\": \"human_segmentation/imgs_validation\",\n", " \"valid_annot_folder\": \"human_segmentation/anns_validation\",\n", " \"valid_times\": 1,\n", " \"valid_metric\": \"val_loss\",\n", " \"batch_size\": 32,\n", " \"learning_rate\": 0.0,\n", " \"saved_folder\": \t\tF\"/content/drive/MyDrive/projects/human_segmentation\",\n", " \"first_trainable_layer\": \"activation\",\n", " \"ignore_zero_class\": False,\n", " \"augmentation\":\t\t\t\tTrue\n", " },\n", " \"converter\" : {\n", " \"type\": \t\t\t\t[]\n", " }\n", " }" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "kobC_7gd5mEu" }, "source": [ "Let's check what GPU we have been assigned in this Colab session, if any." 
] }, { "cell_type": "code", "metadata": { "id": "rESho_T70BWq" }, "source": [ "from tensorflow.python.client import device_lib\n", "device_lib.list_local_devices()" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "WB9096YQUQtb" }, "source": [ "Also, let's open Tensorboard, where we will be able to watch model training progress in real time. Training and validation logs also will be saved in project folder.\n", "Since there are no logs before we start the training, tensorboard will be empty. Refresh it after first epoch." ] }, { "cell_type": "code", "metadata": { "id": "k6P31xsjUSzi" }, "source": [ "%load_ext tensorboard\n", "%tensorboard --logdir logs\n", "!sleep 10" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "cWyKjw-b5_yp" }, "source": [ "Finally we start the training by passing config dictionary we have defined earlier to setup_training function. The function will start the training with Checkpoint, Reduce Learning Rate on Plateu and Early Stopping callbacks. If you have specified the converter type in the config, after the training has stopped the script will convert the best model into the format you have specified in config and save it to the project folder." ] }, { "cell_type": "code", "metadata": { "id": "deYD3cwukHsj" }, "source": [ "from keras import backend as K \n", "K.clear_session()\n", "model_path = setup_training(config_dict = config)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "ypTe3GZI619O" }, "source": [ "After training it is good to check the actual perfomance of your model by doing inference on your validation dataset and visualizing results. This is exactly what next block does. Our model used pre-trained weights and since we set learning rate to 0, we are just observing the perfomance of the model that was trained before." ] }, { "cell_type": "code", "metadata": { "id": "jE7pTYmZN7Pi" }, "source": [ "%matplotlib inline\n", "from keras import backend as K \n", "K.clear_session()\n", "setup_inference(config, model_path)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "23ByTRGE17g-" }, "source": [ "If you need to convert trained model to other formats, for example for inference with OpenCV AI Kit or Raspberry Pi(with quantized tflite model), you can do it with following commands. Specify the converter type, backend and folder with calbiration images(normally your validation image folder)." ] }, { "cell_type": "code", "metadata": { "id": "gXtqAape18K0" }, "source": [ "from axelerate.networks.common_utils.convert import Converter\n", "converter = Converter('k210', 'MobileNet5_0', 'human_segmentation/imgs_validation')\n", "converter.convert_model(model_path)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "crJm0Ttw10g1" }, "source": [ "To train the model from scratch use the following config and then run the cells with training and (optinally) inference functions again." 
] }, { "cell_type": "code", "metadata": { "id": "0r9IKzfQ11UJ" }, "source": [ "config = {\n", " \"model\" : {\n", " \"type\": \"SegNet\",\n", " \"architecture\": \"MobileNet5_0\",\n", " \"input_size\": 224,\n", " \"n_classes\" : \t\t2\n", " },\n", " \"weights\" : {\n", " \"full\": \t\t\t\t\"\",\n", " \"backend\": \t\t \"imagenet\"\n", " },\n", " \"train\" : {\n", " \"actual_epoch\": 100,\n", " \"train_image_folder\": \"human_segmentation/imgs\",\n", " \"train_annot_folder\": \"human_segmentation/anns\",\n", " \"train_times\": 1,\n", " \"valid_image_folder\": \"human_segmentation/imgs_validation\",\n", " \"valid_annot_folder\": \"human_segmentation/anns_validation\",\n", " \"valid_times\": 1,\n", " \"valid_metric\": \"val_loss\",\n", " \"batch_size\": 32,\n", " \"learning_rate\": 1e-3,\n", " \"saved_folder\": \t\tF\"/content/drive/MyDrive/projects/human_segmentation\",\n", " \"first_trainable_layer\": \"\",\n", " \"ignore_zero_class\": False,\n", " \"augumentation\":\t\t\t\tTrue\n", " },\n", " \"converter\" : {\n", " \"type\": \t\t\t\t[\"k210\",\"tflite\"]\n", " }\n", " }" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "uxuW0Bh92FA9" }, "source": [ "from keras import backend as K \n", "K.clear_session()\n", "model_path = setup_training(config_dict=config)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "IK8RLSzA2FKZ" }, "source": [ "%matplotlib inline\n", "from keras import backend as K \n", "K.clear_session()\n", "setup_inference(config, model_path)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "5YuVe2VD11cd" }, "source": [ "Good luck and happy training! Have a look at these articles, that would allow you to get the most of Google Colab or connect to local runtime if there are no GPUs available;\n", "\n", "https://medium.com/@oribarel/getting-the-most-out-of-your-google-colab-2b0585f82403\n", "\n", "https://research.google.com/colaboratory/local-runtimes.html" ] } ] } ================================================ FILE: resources/aXeleRate_mark_detector.ipynb ================================================ { "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "aXeleRate_mark_detector.ipynb", "private_outputs": true, "provenance": [], "collapsed_sections": [], "mount_file_id": "1tDQwRgaEZqe_E-7g2kgi9QQ9FNl6e_2w", "authorship_tag": "ABX9TyOlFv83Dt6/Ug76a0IqmYTT", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "hS9yMrWe02WQ" }, "source": [ "## M.A.R.K. Detection model Training and Inference\n", "\n", "In this notebook we will use axelerate, Keras-based framework for AI on the edge, to quickly setup model training and then after training session is completed convert it to .tflite and .kmodel formats.\n", "\n", "First, let's take care of some administrative details. \n", "\n", "1) Before we do anything, make sure you have choosen GPU as Runtime type (in Runtime - > Change Runtime type).\n", "\n", "2) We need to mount Google Drive for saving our model checkpoints and final converted model(s). Press on Mount Google Drive button in Files tab on your left. \n", "\n", "In the next cell we clone axelerate Github repository and import it. 
\n", "\n", "**It is possible to use pip install or python setup.py install, but in that case you will need to restart the enironment.** Since I'm trying to make the process as streamlined as possibile I'm using sys.path.append for import." ] }, { "cell_type": "code", "metadata": { "id": "y07yAbYbjV2s" }, "source": [ "%load_ext tensorboard\n", "#we need imgaug 0.4 for image augmentations to work properly, see https://stackoverflow.com/questions/62580797/in-colab-doing-image-data-augmentation-with-imgaug-is-not-working-as-intended\n", "!pip uninstall -y imgaug && pip uninstall -y albumentations && pip install imgaug==0.4\n", "!git clone https://github.com/AIWintermuteAI/aXeleRate.git\n", "import sys\n", "sys.path.append('/content/aXeleRate')\n", "from axelerate import setup_training, setup_inference" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "5TBRMPZ83dRL" }, "source": [ "At this step you typically need to get the dataset. You can use !wget command to download it from somewhere on the Internet or !cp to copy from My Drive as in this example\n", "```\n", "!cp -r /content/drive/'My Drive'/pascal_20_segmentation.zip .\n", "!unzip --qq pascal_20_segmentation.zip\n", "```\n", "Dataset preparation and postprocessing are discussed in the article here:\n", "\n", "The annotation tool I use is LabelImg\n", "https://github.com/tzutalin/labelImg\n", "\n", "Let's visualize our detection model test dataset. There are images in validation folder with corresponding annotations in PASCAL-VOC format in validation annotations folder.\n" ] }, { "cell_type": "code", "metadata": { "id": "_tpsgkGj7d79" }, "source": [ "%matplotlib inline\n", "!gdown https://drive.google.com/uc?id=1s2h6DI_1tHpLoUWRc_SavvMF9jYG8XSi #dataset\n", "!gdown https://drive.google.com/uc?id=1-bDRZ9Z2T81SfwhHEfZIMFG7FtMQ5ZiZ #pre-trained model\n", "\n", "!unzip --qq mark_dataset.zip\n", "\n", "from axelerate.networks.common_utils.augment import visualize_detection_dataset\n", "\n", "visualize_detection_dataset(img_folder='mark_detection/imgs_validation', ann_folder='mark_detection/ann_validation', num_imgs=10, img_size=224, augment=True)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "S1oqdtbr7VLB" }, "source": [ "Next step is defining a config dictionary. 
Most lines are self-explanatory.\n", "\n", "Type is the model frontend - Classifier, Detector or Segnet\n", "\n", "Architecture is the model backend (feature extractor) \n", "\n", "- Full Yolo\n", "- Tiny Yolo\n", "- MobileNet1_0\n", "- MobileNet7_5 \n", "- MobileNet5_0 \n", "- MobileNet2_5 \n", "- SqueezeNet\n", "- NASNetMobile\n", "- DenseNet121\n", "- ResNet50\n", "\n", "For more information on anchors, please read here\n", "https://github.com/pjreddie/darknet/issues/568\n", "\n", "Labels are the labels present in your dataset.\n", "IMPORTANT: Please list all the labels present in the dataset.\n", "\n", "object_scale determines how much to penalize wrong prediction of confidence of object predictors\n", "\n", "no_object_scale determines how much to penalize wrong prediction of confidence of non-object predictors\n", "\n", "coord_scale determines how much to penalize wrong position and size predictions (x, y, w, h)\n", "\n", "class_scale determines how much to penalize wrong class prediction\n", "\n", "For converter type you can choose the following:\n", "\n", "'k210', 'tflite_fullint', 'tflite_dynamic', 'edgetpu', 'openvino', 'onnx'\n" ] }, { "cell_type": "markdown", "metadata": { "id": "EkASgMdcj3Nu" }, "source": [ "## Parameters for M.A.R.K. Detection\n", "\n", "K210, which is where we will run the network, has constrained memory (5.5 MB of RAM) available, so with the MicroPython firmware the largest model you can run is about 2 MB, which limits our architecture choice to Tiny Yolo, MobileNet (up to 0.75 alpha) and SqueezeNet. Out of these 3 architectures, only one comes with a pre-trained model - MobileNet. So, to save training time we will use MobileNet with alpha 0.75, which has ... parameters. For objects that do not have that much variety, you can use MobileNet with lower alpha, down to 0.25." ] }, { "cell_type": "code", "metadata": { "id": "Jw4q6_MsegD2" }, "source": [ "config = {\n", " \"model\":{\n", " \"type\": \"Detector\",\n", " \"architecture\": \"MobileNet5_0\",\n", " \"input_size\": 224,\n", " \"anchors\": [0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828],\n", " \"labels\": [\"mark\"],\n", " \"coord_scale\" : \t\t1.0,\n", " \"class_scale\" : \t\t1.0,\n", " \"object_scale\" : \t\t5.0,\n", " \"no_object_scale\" : \t1.0\n", " },\n", " \"weights\" : {\n", " \"full\": \t\t\t\t\"\",\n", " \"backend\": \t\t \"imagenet\"\n", " },\n", " \"train\" : {\n", " \"actual_epoch\": 50,\n", " \"train_image_folder\": \"mark_detection/imgs\",\n", " \"train_annot_folder\": \"mark_detection/ann\",\n", " \"train_times\": 1,\n", " \"valid_image_folder\": \"mark_detection/imgs_validation\",\n", " \"valid_annot_folder\": \"mark_detection/ann_validation\",\n", " \"valid_times\": 1,\n", " \"valid_metric\": \"mAP\",\n", " \"batch_size\": 32,\n", " \"learning_rate\": 1e-3,\n", " \"saved_folder\": \t\tF\"/content/drive/MyDrive/mark_detector\",\n", " \"first_trainable_layer\": \"\",\n", " \"augumentation\":\t\t\t\tTrue,\n", " \"is_only_detect\" : \t\tFalse\n", " },\n", " \"converter\" : {\n", " \"type\": \t\t\t\t[\"k210\",\"tflite\"]\n", " }\n", " }" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "kobC_7gd5mEu" }, "source": [ "Let's check what GPU we have been assigned in this Colab session, if any."
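A quick note on the anchors list in the config above: anchors describe typical box shapes in grid units, and if your objects differ a lot from the defaults you can estimate your own by clustering the boxes from your annotations, as the darknet issue linked earlier describes. A rough sketch, assuming scikit-learn is installed and that wh is an (N, 2) array of box widths and heights you have extracted from your own dataset:

```python
# Rough sketch: estimate YOLO anchors by k-means clustering of box shapes.
from sklearn.cluster import KMeans

def estimate_anchors(wh, n_anchors=5):
    """wh: (N, 2) array of box widths/heights in grid units.

    Returns n_anchors cluster centers, usable as (w, h) anchor pairs.
    """
    km = KMeans(n_clusters=n_anchors, n_init=10).fit(wh)
    return km.cluster_centers_
```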
] }, { "cell_type": "code", "metadata": { "id": "rESho_T70BWq" }, "source": [ "from tensorflow.python.client import device_lib\n", "device_lib.list_local_devices()" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "-oJ6i53GG-I0" }, "source": [ "Also, let's open Tensorboard, where we will be able to watch model training progress in real time. Training and validation logs also will be saved in project folder.\n", "Since there are no logs before we start the training, tensorboard will be empty. Refresh it after first epoch." ] }, { "cell_type": "code", "metadata": { "id": "d8l_DDM4G_aK" }, "source": [ "%tensorboard --logdir logs" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "cWyKjw-b5_yp" }, "source": [ "Finally we start the training by passing config dictionary we have defined earlier to setup_training function. The function will start the training with Checkpoint, Reduce Learning Rate on Plateau and Early Stopping callbacks. After the training has stopped, it will convert the best model into the format you have specified in config and save it to the project folder." ] }, { "cell_type": "code", "metadata": { "id": "deYD3cwukHsj" }, "source": [ "from keras import backend as K \n", "K.clear_session()\n", "model_path = setup_training(config_dict=config)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "ypTe3GZI619O" }, "source": [ "After training it is good to check the actual perfomance of your model by doing inference on your validation dataset and visualizing results. This is exactly what next block does. Obviously since our model has only trained on a few images the results are far from stellar, but if you have a good dataset, you'll have better results." ] }, { "cell_type": "code", "metadata": { "id": "jE7pTYmZN7Pi" }, "source": [ "from keras import backend as K \n", "K.clear_session()\n", "setup_inference(config, model_path)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "5YuVe2VD11cd" }, "source": [ "My end results are:\n", "\n", "{'fscore': 0.942528735632184, 'precision': 0.9318181818181818, 'recall': 0.9534883720930233}\n", "\n", "**You can obtain these results by loading a pre-trained model.**\n", "\n", "Good luck and happy training! 
Have a look at these articles, which will help you get the most out of Google Colab or connect to a local runtime if there are no GPUs available:\n", "\n", "https://medium.com/@oribarel/getting-the-most-out-of-your-google-colab-2b0585f82403\n", "\n", "https://research.google.com/colaboratory/local-runtimes.html" ] } ] } ================================================ FILE: resources/aXeleRate_pascal20_detector.ipynb ================================================ { "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "aXeleRate_pascal20_detector.ipynb", "private_outputs": true, "provenance": [], "collapsed_sections": [], "mount_file_id": "1_yhmzOZKns_-h0GwyPu9YAT3K0WQ1PG8", "authorship_tag": "ABX9TyPUzrsszS4m23mnB7AcN0I9", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "hS9yMrWe02WQ" }, "source": [ "## PASCAL-VOC Detection model Training and Inference\n", "\n", "In this notebook we will use aXeleRate, a Keras-based framework for AI on the edge, to quickly set up model training and then, after the training session is completed, convert the model to .tflite and .kmodel formats.\n", "\n", "First, let's take care of some administrative details. \n", "\n", "1) Before we do anything, make sure you have chosen GPU as Runtime type (in Runtime -> Change Runtime type).\n", "\n", "2) We need to mount Google Drive for saving our model checkpoints and final converted model(s). Press the Mount Google Drive button in the Files tab on your left. \n", "\n", "In the next cell we clone the aXeleRate GitHub repository and import it. \n", "\n", "**It is possible to use pip install or python setup.py install, but in that case you will need to restart the environment.** Since I'm trying to make the process as streamlined as possible, I'm using sys.path.append for the import." ] }, { "cell_type": "code", "metadata": { "id": "y07yAbYbjV2s" }, "source": [ "#we need imgaug 0.4 for image augmentations to work properly, see https://stackoverflow.com/questions/62580797/in-colab-doing-image-data-augmentation-with-imgaug-is-not-working-as-intended\n", "!pip uninstall -y imgaug && pip uninstall -y albumentations && pip install imgaug==0.4\n", "!git clone https://github.com/AIWintermuteAI/aXeleRate.git\n", "import sys\n", "sys.path.append('/content/aXeleRate')\n", "from axelerate import setup_training, setup_inference, setup_evaluation" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "5TBRMPZ83dRL" }, "source": [ "At this step you typically need to get the dataset. You can use the !wget command to download it from somewhere on the Internet, or !cp to copy it from My Drive as in this example\n", "```\n", "!cp -r /content/drive/'My Drive'/pascal_20_segmentation.zip .\n", "!unzip --qq pascal_20_segmentation.zip\n", "```\n", "For this notebook we will use the PASCAL-VOC 2012 object detection dataset, which you can download here:\n", "\n", "http://host.robots.ox.ac.uk:8080/pascal/VOC/voc2012/index.html#devkit\n", "\n", "I split the dataset into training and validation using a simple Python script. Since most of the models trained with aXeleRate are to be run on embedded devices and thus have memory and latency constraints, the validation images are easier than most of the images in the training set.
The validation images include one (or many) instances of a particular class, with no mixed classes in one image.\n", "\n", "Let's visualize our detection model test dataset. We use num_imgs=10 to show only the first 10 images. Feel free to change the number to None to see all 100 images.\n" ] }, { "cell_type": "code", "metadata": { "id": "_tpsgkGj7d79" }, "source": [ "%matplotlib inline\n", "!gdown https://drive.google.com/uc?id=1xgk7svdjBiEyzyUVoZrCz4PP6dSjVL8S #pascal-voc dataset\n", "!gdown https://drive.google.com/uc?id=1-2jYfTRPX4kSUTL5SUQVxwHKjBclrBTA #pre-trained model\n", "!unzip --qq pascal_20_detection.zip\n", "\n", "from axelerate.networks.common_utils.augment import visualize_detection_dataset\n", "\n", "visualize_detection_dataset(img_folder='pascal_20_detection/imgs_validation', ann_folder='pascal_20_detection/anns_validation', num_imgs=10, img_size=320, augment=True)\n" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "S1oqdtbr7VLB" }, "source": [ "The next step is defining a config dictionary. Most lines are self-explanatory.\n", "\n", "Type is the model frontend - Classifier, Detector or Segnet\n", "\n", "Architecture is the model backend (feature extractor) \n", "\n", "- Full Yolo\n", "- Tiny Yolo\n", "- MobileNet1_0\n", "- MobileNet7_5 \n", "- MobileNet5_0 \n", "- MobileNet2_5 \n", "- SqueezeNet\n", "- NASNetMobile\n", "- DenseNet121\n", "- ResNet50\n", "\n", "Currently only MobileNet backends are available for the YOLOv3 detector. I'm working on a backend (feature extractor) overhaul.\n", "\n", "For more information on anchors, please read here\n", "https://github.com/pjreddie/darknet/issues/568\n", "\n", "Labels are the labels present in your dataset.\n", "IMPORTANT: Please list all the labels present in the dataset.\n", "\n", "object_scale determines how much to penalize wrong prediction of confidence of object predictors\n", "\n", "no_object_scale determines how much to penalize wrong prediction of confidence of non-object predictors\n", "\n", "coord_scale determines how much to penalize wrong position and size predictions (x, y, w, h)\n", "\n", "obj_thresh and iou_thresh set the detection confidence threshold and the NMS IoU threshold used when calculating precision/recall\n", "\n", "For converter type you can choose the following:\n", "\n", "'k210', 'tflite_fullint', 'tflite_dynamic', 'edgetpu', 'openvino', 'onnx'\n", "\n", "**Since it is an example notebook, we will use pretrained weights and set the learning rate to 0.0** " ] }, { "cell_type": "code", "metadata": { "id": "Jw4q6_MsegD2" }, "source": [ "config = {\n", " \"model\":{\n", " \"type\": \"Detector\",\n", " \"architecture\": \"MobileNet1_0\",\n", " \"input_size\": [224, 320],\n", " \"anchors\": [[[0.76120044, 0.57155991], [0.6923348, 0.88535553], [0.47163042, 0.34163313]],\n", " [[0.33340788, 0.70065861], [0.18124964, 0.38986752], [0.08497349, 0.1527057 ]]],\n", " \"labels\": [\"person\", \"bird\", \"cat\", \"cow\", \"dog\", \"horse\", \"sheep\", \"aeroplane\", \"bicycle\", \"boat\", \"bus\", \"car\", \"motorbike\", \"train\",\"bottle\", \"chair\", \"diningtable\", \"pottedplant\", \"sofa\", \"tvmonitor\"],\n", " \"obj_thresh\" : \t\t 0.7,\n", " \"iou_thresh\" : \t\t 0.5,\n", " \"coord_scale\" : \t\t 1.0,\n", " \"object_scale\" : \t\t 3.0, \n", " \"no_object_scale\" : \t1.0\n", " },\n", " \"weights\" : {\n", " \"full\": \t\t\t\t \"/content/yolo_best_recall.h5\",\n", " \"backend\": \t\t \"imagenet\"\n", " },\n", " \"train\" : {\n", " \"actual_epoch\": 1,\n", " \"train_image_folder\": 
\"pascal_20_detection/imgs\",\n", " \"train_annot_folder\": \"pascal_20_detection/anns\",\n", " \"train_times\": 1,\n", " \"valid_image_folder\": \"pascal_20_detection/imgs_validation\",\n", " \"valid_annot_folder\": \"pascal_20_detection/anns_validation\",\n", " \"valid_times\": 1,\n", " \"valid_metric\": \"recall\",\n", " \"batch_size\": 32,\n", " \"learning_rate\": 0.0,\n", " \"saved_folder\": \t\tF\"/content/drive/MyDrive/projects/pascal20_yolov3\",\n", " \"first_trainable_layer\": \"\",\n", " \"augmentation\":\t\t\t\t True,\n", " \"is_only_detect\" : \t\t False\n", " },\n", " \"converter\" : {\n", " \"type\": \t\t\t\t[]\n", " }\n", "}" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "kobC_7gd5mEu" }, "source": [ "Let's check what GPU we have been assigned in this Colab session, if any." ] }, { "cell_type": "code", "metadata": { "id": "rESho_T70BWq" }, "source": [ "from tensorflow.python.client import device_lib\n", "device_lib.list_local_devices()" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "i0Fc61WrTxh1" }, "source": [ "Also, let's open Tensorboard, where we will be able to watch model training progress in real time. Training and validation logs also will be saved in project folder.\n", "Since there are no logs before we start the training, tensorboard will be empty. Refresh it after first epoch." ] }, { "cell_type": "code", "metadata": { "id": "jsGp9JvjTzzp" }, "source": [ "%load_ext tensorboard\n", "%tensorboard --logdir logs\n", "!sleep 5" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "cWyKjw-b5_yp" }, "source": [ "Finally we start the training by passing config dictionary we have defined earlier to setup_training function. The function will start the training with Reduce Learning Rate on Plateau and save on best mAP callbacks. Every epoch mAP of the model predictions is measured on the validation dataset. If you have specified the converter type in the config, after the training has stopped the script will convert the best model into the format you have specified in config and save it to the project folder.\n", "\n", "Let's train for one epoch to see how the whole pipeline works." ] }, { "cell_type": "code", "metadata": { "id": "deYD3cwukHsj" }, "source": [ "from keras import backend as K \n", "K.clear_session()\n", "model_path = setup_training(config_dict=config)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "ypTe3GZI619O" }, "source": [ "After training it is good to check the actual perfomance of your model by doing inference on your validation dataset and visualizing results. This is exactly what next block does. Our model used pre-trained weights and since all the layers were set as non-trainable, we are just observing the perfomance of the model that was trained before." ] }, { "cell_type": "code", "metadata": { "id": "jE7pTYmZN7Pi" }, "source": [ "%matplotlib inline\n", "from keras import backend as K \n", "K.clear_session()\n", "setup_inference(config, model_path)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "nKsxhdPvzrD8" }, "source": [ "If you need to convert trained model to other formats, for example for inference with Edge TPU or OpenCV AI Kit, you can do it with following commands. Specify the converter type, backend and folder with calbiration images(normally your validation image folder)." 
] }, { "cell_type": "code", "metadata": { "id": "awR7r4ILzrmb" }, "source": [ "from axelerate.networks.common_utils.convert import Converter\n", "converter = Converter('tflite_dynamic', 'MobileNet1_0', 'pascal_20_detection/imgs_validation')\n", "converter.convert_model(model_path)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "JPvYzcRhfs2u" }, "source": [ "To train the model from scratch use the following config and then run the cells with training and (optinally) inference functions again." ] }, { "cell_type": "code", "metadata": { "id": "uruWpeGRf6Qi" }, "source": [ "config = {\n", " \"model\":{\n", " \"type\": \"Detector\",\n", " \"architecture\": \"MobileNet1_0\",\n", " \"input_size\": [224, 320],\n", " \"anchors\": [[[0.76120044, 0.57155991], [0.6923348, 0.88535553], [0.47163042, 0.34163313]],\n", " [[0.33340788, 0.70065861], [0.18124964, 0.38986752], [0.08497349, 0.1527057 ]]],\n", " \"labels\": [\"person\", \"bird\", \"cat\", \"cow\", \"dog\", \"horse\", \"sheep\", \"aeroplane\", \"bicycle\", \"boat\", \"bus\", \"car\", \"motorbike\", \"train\",\"bottle\", \"chair\", \"diningtable\", \"pottedplant\", \"sofa\", \"tvmonitor\"],\n", " \"obj_thresh\" : \t\t 0.7,\n", " \"iou_thresh\" : \t\t 0.5,\n", " \"coord_scale\" : \t\t 1.0,\n", " \"object_scale\" : \t\t 3.0, \n", " \"no_object_scale\" : \t1.0\n", " },\n", " \"weights\" : {\n", " \"full\": \t\t\t\t \"\",\n", " \"backend\": \t\t \"imagenet\"\n", " },\n", " \"train\" : {\n", " \"actual_epoch\": 50,\n", " \"train_image_folder\": \"pascal_20_detection/imgs\",\n", " \"train_annot_folder\": \"pascal_20_detection/anns\",\n", " \"train_times\": 1,\n", " \"valid_image_folder\": \"pascal_20_detection/imgs_validation\",\n", " \"valid_annot_folder\": \"pascal_20_detection/anns_validation\",\n", " \"valid_times\": 1,\n", " \"valid_metric\": \"recall\",\n", " \"batch_size\": 32,\n", " \"learning_rate\": 1e-3,\n", " \"saved_folder\": \t\tF\"/content/drive/MyDrive/projects/pascal20_yolov3\",\n", " \"first_trainable_layer\": \"\",\n", " \"augmentation\":\t\t\t\t True,\n", " \"is_only_detect\" : \t\t False\n", " },\n", " \"converter\" : {\n", " \"type\": \t\t\t\t[]\n", " }\n", "}" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "1frVrWMcf-k7" }, "source": [ "from keras import backend as K \n", "K.clear_session()\n", "model_path = setup_training(config_dict=config)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "Ipv1AGzRgAMA" }, "source": [ "%matplotlib inline\n", "from keras import backend as K \n", "K.clear_session()\n", "setup_inference(config, model_path)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "5YuVe2VD11cd" }, "source": [ "Good luck and happy training! 
Have a look at these articles, which will help you get the most out of Google Colab or connect to a local runtime if there are no GPUs available:\n", "\n", "https://medium.com/@oribarel/getting-the-most-out-of-your-google-colab-2b0585f82403\n", "\n", "https://research.google.com/colaboratory/local-runtimes.html" ] } ] } ================================================ FILE: resources/aXeleRate_person_detector.ipynb ================================================ { "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "aXeleRate_person_detector.ipynb", "private_outputs": true, "provenance": [], "collapsed_sections": [], "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "hS9yMrWe02WQ" }, "source": [ "## Person Detection model Training and Inference\n", "\n", "In this notebook we will use aXeleRate, a Keras-based framework for AI on the edge, to quickly set up model training and then, after the training session is completed, convert the model to .tflite and .kmodel formats.\n", "\n", "First, let's take care of some administrative details. \n", "\n", "1) Before we do anything, make sure you have chosen GPU as Runtime type (in Runtime -> Change Runtime type).\n", "\n", "2) We need to mount Google Drive for saving our model checkpoints and final converted model(s). Press the Mount Google Drive button in the Files tab on your left. \n", "\n", "In the next cell we clone the aXeleRate GitHub repository and import it. \n", "\n", "**It is possible to use pip install or python setup.py install, but in that case you will need to restart the environment.** Since I'm trying to make the process as streamlined as possible, I'm using sys.path.append for the import." ] }, { "cell_type": "code", "metadata": { "id": "y07yAbYbjV2s" }, "source": [ "%load_ext tensorboard\n", "#we need imgaug 0.4 for image augmentations to work properly, see https://stackoverflow.com/questions/62580797/in-colab-doing-image-data-augmentation-with-imgaug-is-not-working-as-intended\n", "!pip uninstall -y imgaug && pip uninstall -y albumentations && pip install imgaug==0.4\n", "!pip install --upgrade --no-cache-dir gdown\n", "!git clone https://github.com/AIWintermuteAI/aXeleRate.git\n", "import sys\n", "sys.path.append('/content/aXeleRate')\n", "from axelerate import setup_training, setup_inference" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "5TBRMPZ83dRL" }, "source": [ "At this step you typically need to get the dataset. You can use the !wget command to download it from somewhere on the Internet, or !cp to copy it from My Drive as in this example\n", "```\n", "!cp -r /content/drive/'My Drive'/pascal_20_segmentation.zip .\n", "!unzip --qq pascal_20_segmentation.zip\n", "```\n", "For this notebook we'll use the gdown command line tool to download the dataset for person detection I shared on Google Drive, and then unzip it with the unzip command. It is based on the INRIA person detection dataset, which I converted to PASCAL-VOC annotation format.\n", "https://dbcollection.readthedocs.io/en/latest/datasets/inria_ped.html\n", "When actually training the model myself I added about 400 pictures of our office staff, which I cannot share online. I recommend you also augment this dataset by taking and annotating pictures of your family/friends.
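Since the annotations are PASCAL-VOC XML files (LabelImg's output format; see sample_datasets/detector/anns in this repository for examples), they are easy to inspect with the standard library. A minimal reading sketch - my own, not aXeleRate's parser in axelerate/networks/yolo/backend/utils/annotation.py:

```python
# Read object names and bounding boxes from one PASCAL-VOC annotation file.
import xml.etree.ElementTree as ET

def read_voc_boxes(path):
    root = ET.parse(path).getroot()
    boxes = []
    for obj in root.findall('object'):
        bb = obj.find('bndbox')
        boxes.append((obj.find('name').text,
                      int(float(bb.find('xmin').text)),
                      int(float(bb.find('ymin').text)),
                      int(float(bb.find('xmax').text)),
                      int(float(bb.find('ymax').text))))
    return boxes
```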
The annotation tool I use is LabelImg\n", "https://github.com/tzutalin/labelImg\n", "\n", "Let's visualize our detection model test dataset. There are images in the validation folder with corresponding annotations in PASCAL-VOC format in the validation annotations folder.\n" ] }, { "cell_type": "code", "metadata": { "id": "_tpsgkGj7d79" }, "source": [ "%matplotlib inline\n", "!gdown https://drive.google.com/uc?id=1UWwxlJm5JH_JiBY9PoLgGyHsRDzBqRGU #dataset\n", "!gdown https://drive.google.com/uc?id=1-2fiBxykZVZBRcux9I6mKZaS3yAHq6hk #pre-trained model\n", "\n", "!unzip --qq person_dataset.zip\n", "\n", "from axelerate.networks.common_utils.augment import visualize_detection_dataset\n", "\n", "visualize_detection_dataset(img_folder='person_dataset/imgs_validation', ann_folder='person_dataset/anns_validation', img_size=None, augment=True)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "S1oqdtbr7VLB" }, "source": [ "The next step is defining a config dictionary. Most lines are self-explanatory.\n", "\n", "Type is the model frontend - Classifier, Detector or Segnet\n", "\n", "Architecture is the model backend (feature extractor) \n", "\n", "- Full Yolo\n", "- Tiny Yolo\n", "- MobileNet1_0\n", "- MobileNet7_5 \n", "- MobileNet5_0 \n", "- MobileNet2_5 \n", "- SqueezeNet\n", "- NASNetMobile\n", "- DenseNet121\n", "- ResNet50\n", "\n", "For more information on anchors, please read here\n", "https://github.com/pjreddie/darknet/issues/568\n", "\n", "Labels are the labels present in your dataset.\n", "IMPORTANT: Please list all the labels present in the dataset.\n", "\n", "object_scale determines how much to penalize wrong prediction of confidence of object predictors\n", "\n", "no_object_scale determines how much to penalize wrong prediction of confidence of non-object predictors\n", "\n", "coord_scale determines how much to penalize wrong position and size predictions (x, y, w, h)\n", "\n", "class_scale determines how much to penalize wrong class prediction\n", "\n", "For converter type you can choose the following:\n", "\n", "'k210', 'tflite_fullint', 'tflite_dynamic', 'edgetpu', 'openvino', 'onnx'" ] }, { "cell_type": "markdown", "metadata": { "id": "EkASgMdcj3Nu" }, "source": [ "## Parameters for Person Detection\n", "\n", "K210, which is where we will run the network, has constrained memory (5.5 MB of RAM) available, so with the MicroPython firmware the largest model you can run is about 2 MB, which limits our architecture choice to Tiny Yolo, MobileNet (up to 0.75 alpha) and SqueezeNet. Out of these 3 architectures, only one comes with a pre-trained model - MobileNet. So, to save training time we will use MobileNet with alpha 0.75, which has ... parameters. For objects that do not have that much variety, you can use MobileNet with lower alpha, down to 0.25."
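To check a candidate architecture against the roughly 2 MB budget mentioned above before converting, you can estimate the weight size from the parameter count. A back-of-the-envelope sketch (assumes a loaded Keras model; 1 byte per parameter is my assumption for 8-bit quantized .kmodel weights, 4 bytes for float32 weights):

```python
# Back-of-the-envelope model size estimate from the parameter count.
def model_size_mb(model, bytes_per_param=1):
    """bytes_per_param: 1 for 8-bit quantized weights, 4 for float32."""
    return model.count_params() * bytes_per_param / 1e6
```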
] }, { "cell_type": "code", "metadata": { "id": "Jw4q6_MsegD2" }, "source": [ "config = {\n", " \"model\":{\n", " \"type\": \"Detector\",\n", " \"architecture\": \"MobileNet5_0\",\n", " \"input_size\": [224, 320],\n", " \"anchors\": [[[0.76120044, 0.57155991], [0.6923348, 0.88535553], [0.47163042, 0.34163313]],\n", " [[0.33340788, 0.70065861], [0.18124964, 0.38986752], [0.08497349, 0.1527057 ]]],\n", " \"labels\": [\"person\"],\n", " \"obj_thresh\" : \t\t 0.7,\n", " \"iou_thresh\" : \t\t 0.5,\n", " \"coord_scale\" : \t\t1.0,\n", " \"class_scale\" : \t\t1.0,\n", " \"object_scale\" : \t\t5.0,\n", " \"no_object_scale\" : \t1.0\n", " },\n", " \"weights\" : {\n", " \"full\": \t\t\t\t\"\",\n", " \"backend\": \t\t \"imagenet\"\n", " },\n", " \"train\" : {\n", " \"actual_epoch\": 1,\n", " \"train_image_folder\": \"person_dataset/imgs\",\n", " \"train_annot_folder\": \"person_dataset/anns\",\n", " \"train_times\": 1,\n", " \"valid_image_folder\": \"person_dataset/imgs_validation\",\n", " \"valid_annot_folder\": \"person_dataset/anns_validation\",\n", " \"valid_times\": 1,\n", " \"valid_metric\": \"recall\",\n", " \"batch_size\": 10,\n", " \"learning_rate\": 1e-3,\n", " \"saved_folder\": \t\tF\"/content/drive/MyDrive/person_detector\",\n", " \"first_trainable_layer\": \"\",\n", " \"augmentation\":\t\t\t\tTrue,\n", " \"is_only_detect\" : \t\tFalse\n", " },\n", " \"converter\" : {\n", " \"type\": \t\t\t\t[\"k210\",\"tflite\"]\n", " }\n", " }" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "kobC_7gd5mEu" }, "source": [ "Let's check what GPU we have been assigned in this Colab session, if any." ] }, { "cell_type": "code", "metadata": { "id": "rESho_T70BWq" }, "source": [ "from tensorflow.python.client import device_lib\n", "device_lib.list_local_devices()" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "gtNVJF3WIYXL" }, "source": [ "Also, let's open Tensorboard, where we will be able to watch model training progress in real time. Training and validation logs also will be saved in project folder.\n", "Since there are no logs before we start the training, tensorboard will be empty. Refresh it after first epoch." ] }, { "cell_type": "code", "metadata": { "id": "lLUCRqhSIcRP" }, "source": [ "%tensorboard --logdir logs" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "cWyKjw-b5_yp" }, "source": [ "Finally we start the training by passing config dictionary we have defined earlier to setup_training function. The function will start the training with Checkpoint, Reduce Learning Rate on Plateau and Early Stopping callbacks. After the training has stopped, it will convert the best model into the format you have specified in config and save it to the project folder." ] }, { "cell_type": "code", "metadata": { "id": "deYD3cwukHsj" }, "source": [ "from keras import backend as K \n", "K.clear_session()\n", "model_path = setup_training(config_dict=config)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "ypTe3GZI619O" }, "source": [ "After training it is good to check the actual perfomance of your model by doing inference on your validation dataset and visualizing results. This is exactly what next block does." 
] }, { "cell_type": "code", "metadata": { "id": "jE7pTYmZN7Pi" }, "source": [ "%matplotlib inline\n", "from keras import backend as K \n", "K.clear_session()\n", "setup_inference(config, model_path)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "5YuVe2VD11cd" }, "source": [ "The pre-trained weights inference results are: {'fscore': 0.918918918918919, 'precision': 0.8947368421052632, 'recall': 0.9444444444444444}, final validation mAP 0.5657894736842105 \n", "**weights name: YOLO_best_mAP.h5**\n", "\n", "Good luck and happy training! Have a look at these articles, that would allow you to get the most of Google Colab or connect to local runtime if there are no GPUs available;\n", "\n", "https://medium.com/@oribarel/getting-the-most-out-of-your-google-colab-2b0585f82403\n", "\n", "https://research.google.com/colaboratory/local-runtimes.html" ] } ] } ================================================ FILE: resources/aXeleRate_standford_dog_classifier.ipynb ================================================ { "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "aXeleRate_standford_dog_classifier.ipynb", "private_outputs": true, "provenance": [], "collapsed_sections": [], "mount_file_id": "1rCJbj9BGoDxEt1ERSK3onxShVBv9LS7B", "authorship_tag": "ABX9TyP3QFJgHG/Wic0bXC60lYCn", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "hS9yMrWe02WQ" }, "source": [ "## Standford Dog Breed Classification model Training and Inference\n", "\n", "In this notebook we will use axelerate Keras-based framework for AI on the edge to quickly setup model training and then after training session is completed convert it to .tflite and .kmodel formats.\n", "\n", "First, let's take care of some administrative details. \n", "\n", "1) Before we do anything, make sure you have choosen GPU as Runtime type (in Runtime - > Change Runtime type).\n", "\n", "2) We need to mount Google Drive for saving our model checkpoints and final converted model(s). Press on Mount Google Drive button in Files tab on your left. \n", "\n", "In the next cell we clone axelerate Github repository and import it. \n", "\n", "**It is possible to use pip install or python setup.py install, but in that case you will need to restart the enironment.** Since I'm trying to make the process as streamlined as possibile I'm using sys.path.append for import." ] }, { "cell_type": "code", "metadata": { "id": "y07yAbYbjV2s" }, "source": [ "#we need imgaug 0.4 for image augmentations to work properly, see https://stackoverflow.com/questions/62580797/in-colab-doing-image-data-augmentation-with-imgaug-is-not-working-as-intended\n", "!pip uninstall -y imgaug && pip uninstall -y albumentations && pip install imgaug==0.4\n", "!git clone https://github.com/AIWintermuteAI/aXeleRate.git\n", "import sys\n", "sys.path.append('/content/aXeleRate')\n", "from axelerate import setup_training, setup_inference" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "5TBRMPZ83dRL" }, "source": [ "At this step you typically need to get the dataset. 
You can use the !wget command to download it from somewhere on the Internet, or !cp to copy it from My Drive as in this example\n", "```\n", "!cp -r /content/drive/'My Drive'/pascal_20_segmentation.zip .\n", "!unzip --qq pascal_20_segmentation.zip\n", "```\n", "For this notebook we will use the Stanford Dog Breed Classification dataset for fine-grained classification, which you can download here:\n", "http://vision.stanford.edu/aditya86/ImageNetDogs/\n", "\n", "In the next cell we will download the same dataset, but with the training/validation split already done, which I shared on my Google Drive. We will also download a pre-trained model to demonstrate inference results.\n", "\n", "Let's visualize our classification validation dataset with the visualize_classification_dataset function, which will search for all images in the folder and display num_imgs images with the class label overlaid on the image.\n" ] }, { "cell_type": "code", "metadata": { "id": "_tpsgkGj7d79" }, "source": [ "%matplotlib inline\n", "!gdown https://drive.google.com/uc?id=1qq758Tjsfm7Euu9ev7hSyLkMj63YC9ST #dog breed classification dataset\n", "!gdown https://drive.google.com/uc?id=1dFnDCOxws2uX4ZpauSPC6r6jdjHoJw_p #pre-trained model\n", "!unzip --qq dogs_classification.zip\n", "\n", "from axelerate.networks.common_utils.augment import visualize_classification_dataset\n", "\n", "visualize_classification_dataset('dogs_classification/imgs_validation', num_imgs=10, img_size=224, augment=True)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "S1oqdtbr7VLB" }, "source": [ "The next step is defining a config dictionary. Most lines are self-explanatory.\n", "\n", "Type is the model frontend - Classifier, Detector or Segnet\n", "\n", "Architecture is the model backend (feature extractor) \n", "\n", "- Full Yolo\n", "- Tiny Yolo\n", "- MobileNet1_0\n", "- MobileNet7_5 \n", "- MobileNet5_0 \n", "- MobileNet2_5 \n", "- SqueezeNet\n", "- NASNetMobile\n", "- DenseNet121\n", "- ResNet50\n", "\n", "**Note that while you can train any network type with any backend (Tiny YOLO + Classifier, NASNETMobile + Detector, DenseNet121 + Segnet and so on), some converters do not support larger networks! E.g. the K210 converter only supports MobileNet and TinyYOLO backends.**\n", "\n", "fully-connected is the number of neurons in the classification layers, as a list.\n", "\n", "Dropout value is the dropout in the classification layers.\n", "\n", "actual_epoch is the number of epochs to train; normally a good starting value is 50 - 100\n", "\n", "train_times is a multiplier for the training dataset, i.e. how many times to repeat the dataset during one epoch. Useful when you apply augmentations to images. Normally between 1 and 3 is okay. If you have a big dataset, you can leave it at 1.\n", "\n", "For converter type you can choose the following:\n", "\n", "'k210', 'tflite_fullint', 'tflite_dynamic', 'edgetpu', 'openvino', 'onnx'\n", "\n", "**Since it is an example notebook, we will use pretrained weights and set all layers of the model to be \"frozen\" (non-trainable), except for the last one.
Also, we set the learning rate to a very low value, which will allow us to see the performance of the pretrained model** " ] }, { "cell_type": "code", "metadata": { "id": "Jw4q6_MsegD2" }, "source": [ "config = {\n", " \"model\" : {\n", " \"type\": \"Classifier\",\n", " \"architecture\": \"NASNetMobile\",\n", " \"input_size\": 224,\n", " \"fully-connected\": [],\n", " \"labels\": [],\n", " \"dropout\" : \t\t0.2\n", " },\n", " \"weights\" : {\n", " \"full\": \t\t\t\t\"/content/Classifier_best_val_accuracy.h5\",\n", " \"backend\": \t\t \"imagenet\",\n", " \"save_bottleneck\": False\n", " \n", " },\n", " \"train\" : {\n", " \"actual_epoch\": 1,\n", " \"train_image_folder\": \"dogs_classification/imgs\",\n", " \"train_times\": 1,\n", " \"valid_image_folder\": \"dogs_classification/imgs_validation\",\n", " \"valid_times\": 1,\n", " \"valid_metric\": \"val_accuracy\",\n", " \"batch_size\": 16,\n", " \"learning_rate\": 0.0,\n", " \"saved_folder\": \t\tF\"/content/drive/MyDrive/dogs_classifier\",\n", " \"first_trainable_layer\": \"dense\",\n", " \"augmentation\":\t\t\t\tTrue\n", " },\n", " \"converter\" : {\n", " \"type\": \t\t\t\t[]\n", " }\n", "}" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "kobC_7gd5mEu" }, "source": [ "Let's check what GPU we have been assigned in this Colab session, if any." ] }, { "cell_type": "code", "metadata": { "id": "rESho_T70BWq" }, "source": [ "from tensorflow.python.client import device_lib\n", "device_lib.list_local_devices()" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "vsu5OuxwH58t" }, "source": [ "Also, let's open Tensorboard, where we will be able to watch model training progress in real time. Training and validation logs will also be saved in the project folder.\n", "Since there are no logs before we start the training, Tensorboard will be empty. Refresh it after the first epoch." ] }, { "cell_type": "code", "metadata": { "id": "8H59nl11H6kB" }, "source": [ "%load_ext tensorboard\n", "%tensorboard --logdir logs\n", "!sleep 10" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "cWyKjw-b5_yp" }, "source": [ "Finally, we start the training by passing the config dictionary we have defined earlier to the setup_training function. The function will start the training with Checkpoint, Reduce Learning Rate on Plateau and Early Stopping callbacks. Every time our validation metric (in this config set to \"val_accuracy\") improves, the model is saved with the Checkpoint callback. If you have specified the converter type in the config, after the training has stopped the script will convert the best model into the format you have specified and save it to the project folder." ] }, { "cell_type": "code", "metadata": { "id": "deYD3cwukHsj" }, "source": [ "from keras import backend as K \n", "K.clear_session()\n", "model_path = setup_training(config_dict=config)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "ypTe3GZI619O" }, "source": [ "After training it is good to check the actual performance of your model by doing inference on your validation dataset and visualizing the results. This is exactly what the next block does. Our model used pre-trained weights, and since all the layers, except for the last one, were set as non-trainable and we set the learning rate to a very low value, we are just observing the performance of the model that was trained before."
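Returning to the fully-connected and dropout options described earlier: conceptually they define a small classification head stacked on top of the frozen feature extractor. An illustrative sketch (my own, not aXeleRate's model-building code; the layer sizes here are made up):

```python
# Illustrative head for "fully-connected": [100, 50] and "dropout": 0.2.
from tensorflow.keras import layers

def classification_head(features, fully_connected=(100, 50), dropout=0.2,
                        n_classes=120):
    x = features
    for units in fully_connected:          # one Dense + Dropout per list entry
        x = layers.Dense(units, activation='relu')(x)
        x = layers.Dropout(dropout)(x)
    return layers.Dense(n_classes, activation='softmax')(x)
```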
{ "cell_type": "markdown", "metadata": { "id": "ypTe3GZI619O" }, "source": [ "After training it is good to check the actual performance of your model by doing inference on the validation dataset and visualizing the results. This is exactly what the next block does. Our model used pre-trained weights, and since all the layers except for the last one were set as non-trainable and the learning rate was set to a very low value, we are just observing the performance of the model that was trained before." ] }, { "cell_type": "code", "metadata": { "id": "jE7pTYmZN7Pi" }, "source": [ "%matplotlib inline\n", "from keras import backend as K \n", "K.clear_session()\n", "setup_inference(config, model_path)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "PF__ooBsyb58" }, "source": [ "If you need to convert the trained model to other formats, for example for inference with the Edge TPU or Kendryte K210, you can do it with the following commands. Specify the converter type, the backend and the folder with calibration images (normally your validation image folder)." ] }, { "cell_type": "code", "metadata": { "id": "fGNqUf1Gyc4z" }, "source": [ "from axelerate.networks.common_utils.convert import Converter\n", "converter = Converter('tflite_dynamic', 'NASNetMobile', 'dogs_classification/imgs_validation')\n", "converter.convert_model(model_path)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "fn7H0V4SEOd_" }, "source": [ "To train the model from scratch, use the following config and then run the cells with the training and (optionally) inference functions again." ] }, { "cell_type": "code", "metadata": { "id": "oT87SwQ6EQB8" }, "source": [ "config = {\n", " \"model\" : {\n", " \"type\": \"Classifier\",\n", " \"architecture\": \"NASNetMobile\",\n", " \"input_size\": 224,\n", " \"fully-connected\": [],\n", " \"labels\": [],\n", " \"dropout\" : \t\t0.2\n", " },\n", " \"weights\" : {\n", " \"full\": \t\t\t\t\"\",\n", " \"backend\": \t\t \"imagenet\",\n", " \"save_bottleneck\": False\n", " \n", " },\n", " \"train\" : {\n", " \"actual_epoch\": 50,\n", " \"train_image_folder\": \"dogs_classification/imgs\",\n", " \"train_times\": 1,\n", " \"valid_image_folder\": \"dogs_classification/imgs_validation\",\n", " \"valid_times\": 1,\n", " \"valid_metric\": \"val_accuracy\",\n", " \"batch_size\": 16,\n", " \"learning_rate\": 1e-3,\n", " \"saved_folder\": \t\tF\"/content/drive/MyDrive/dogs_classifier\",\n", " \"first_trainable_layer\": \"\",\n", " \"augmentation\":\t\t\t\tTrue\n", " },\n", " \"converter\" : {\n", " \"type\": \t\t\t\t[\"tflite_dynamic\"]\n", " }\n", "}" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "NQjvas2UEe8l" }, "source": [ "from keras import backend as K \n", "K.clear_session()\n", "model_path = setup_training(config_dict=config)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "iJJWjuRaEfkj" }, "source": [ "%matplotlib inline\n", "from keras import backend as K \n", "K.clear_session()\n", "setup_inference(config, model_path)" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "5YuVe2VD11cd" }, "source": [ "Good luck and happy training!
Have a look at these articles, which will help you get the most out of Google Colab, or connect to a local runtime if no GPUs are available:\n", "\n", "https://medium.com/@oribarel/getting-the-most-out-of-your-google-colab-2b0585f82403\n", "\n", "https://research.google.com/colaboratory/local-runtimes.html" ] } ] }

================================================
FILE: sample_datasets/detector/anns/2007_000032.xml
================================================
<annotation>
    <folder>VOC2012</folder>
    <filename>2007_000032.jpg</filename>
    <source><database>The VOC2007 Database</database><annotation>PASCAL VOC2007</annotation><image>flickr</image></source>
    <size><width>500</width><height>281</height><depth>3</depth></size>
    <segmented>1</segmented>
    <object><name>aeroplane</name><pose>Frontal</pose><truncated>0</truncated><difficult>0</difficult><bndbox><xmin>104</xmin><ymin>78</ymin><xmax>375</xmax><ymax>183</ymax></bndbox></object>
    <object><name>aeroplane</name><pose>Left</pose><truncated>0</truncated><difficult>0</difficult><bndbox><xmin>133</xmin><ymin>88</ymin><xmax>197</xmax><ymax>123</ymax></bndbox></object>
    <object><name>person</name><pose>Rear</pose><truncated>0</truncated><difficult>0</difficult><bndbox><xmin>195</xmin><ymin>180</ymin><xmax>213</xmax><ymax>229</ymax></bndbox></object>
    <object><name>person</name><pose>Rear</pose><truncated>0</truncated><difficult>0</difficult><bndbox><xmin>26</xmin><ymin>189</ymin><xmax>44</xmax><ymax>238</ymax></bndbox></object>
</annotation>

================================================
FILE: sample_datasets/detector/anns/2007_000033.xml
================================================
<annotation>
    <folder>VOC2012</folder>
    <filename>2007_000033.jpg</filename>
    <source><database>The VOC2007 Database</database><annotation>PASCAL VOC2007</annotation><image>flickr</image></source>
    <size><width>500</width><height>366</height><depth>3</depth></size>
    <segmented>1</segmented>
    <object><name>aeroplane</name><pose>Unspecified</pose><truncated>0</truncated><difficult>0</difficult><bndbox><xmin>9</xmin><ymin>107</ymin><xmax>499</xmax><ymax>263</ymax></bndbox></object>
    <object><name>aeroplane</name><pose>Left</pose><truncated>0</truncated><difficult>0</difficult><bndbox><xmin>421</xmin><ymin>200</ymin><xmax>482</xmax><ymax>226</ymax></bndbox></object>
    <object><name>aeroplane</name><pose>Left</pose><truncated>1</truncated><difficult>0</difficult><bndbox><xmin>325</xmin><ymin>188</ymin><xmax>411</xmax><ymax>223</ymax></bndbox></object>
</annotation>

================================================
FILE: sample_datasets/detector/anns_validation/2007_000243.xml
================================================
<annotation>
    <folder>VOC2012</folder>
    <filename>2007_000243.jpg</filename>
    <source><database>The VOC2007 Database</database><annotation>PASCAL VOC2007</annotation><image>flickr</image></source>
    <size><width>500</width><height>333</height><depth>3</depth></size>
    <segmented>1</segmented>
    <object><name>aeroplane</name><pose>Unspecified</pose><truncated>0</truncated><difficult>0</difficult><bndbox><xmin>181</xmin><ymin>127</ymin><xmax>274</xmax><ymax>193</ymax></bndbox></object>
</annotation>

================================================
FILE: sample_datasets/detector/anns_validation/2007_000250.xml
================================================
<annotation>
    <folder>VOC2012</folder>
    <filename>2007_000250.jpg</filename>
    <source><database>The VOC2007 Database</database><annotation>PASCAL VOC2007</annotation><image>flickr</image></source>
    <size><width>500</width><height>375</height><depth>3</depth></size>
    <segmented>1</segmented>
    <object><name>diningtable</name><pose>Unspecified</pose><truncated>1</truncated><difficult>1</difficult><bndbox><xmin>1</xmin><ymin>170</ymin><xmax>474</xmax><ymax>375</ymax></bndbox></object>
    <object><name>bottle</name><pose>Unspecified</pose><truncated>0</truncated><difficult>0</difficult><bndbox><xmin>97</xmin><ymin>124</ymin><xmax>150</xmax><ymax>297</ymax></bndbox></object>
</annotation>

================================================
FILE: sample_datasets/detector/anns_validation/2007_000645.xml
================================================
<annotation>
    <folder>VOC2012</folder>
    <filename>2007_000645.jpg</filename>
    <source><database>The VOC2007 Database</database><annotation>PASCAL VOC2007</annotation><image>flickr</image></source>
    <size><width>500</width><height>375</height><depth>3</depth></size>
    <segmented>1</segmented>
    <object><name>bird</name><pose>Left</pose><truncated>0</truncated><difficult>0</difficult><bndbox><xmin>135</xmin><ymin>46</ymin><xmax>500</xmax><ymax>374</ymax></bndbox></object>
    <object><name>bird</name><pose>Left</pose><truncated>0</truncated><difficult>0</difficult><bndbox><xmin>124</xmin><ymin>146</ymin><xmax>365</xmax><ymax>375</ymax></bndbox></object>
</annotation>

================================================
FILE: sample_datasets/detector/anns_validation/2007_001595.xml
================================================
<annotation>
    <folder>VOC2012</folder>
    <filename>2007_001595.jpg</filename>
    <source><database>The VOC2007 Database</database><annotation>PASCAL VOC2007</annotation><image>flickr</image></source>
    <size><width>500</width><height>375</height><depth>3</depth></size>
    <segmented>1</segmented>
    <object><name>bus</name><pose>Unspecified</pose><truncated>0</truncated><difficult>0</difficult><bndbox><xmin>268</xmin><ymin>162</ymin><xmax>442</xmax><ymax>296</ymax></bndbox></object>
    <object><name>bus</name><pose>Unspecified</pose><truncated>1</truncated><difficult>0</difficult><bndbox><xmin>40</xmin><ymin>158</ymin><xmax>275</xmax><ymax>288</ymax></bndbox></object>
</annotation>

================================================
FILE: sample_datasets/detector/anns_validation/2007_001834.xml
================================================
<annotation>
    <folder>VOC2012</folder>
    <filename>2007_001834.jpg</filename>
    <source><database>The VOC2007 Database</database><annotation>PASCAL VOC2007</annotation><image>flickr</image></source>
    <size><width>500</width><height>334</height><depth>3</depth></size>
    <segmented>1</segmented>
    <object><name>diningtable</name><pose>Unspecified</pose><truncated>0</truncated><difficult>0</difficult><bndbox><xmin>46</xmin><ymin>39</ymin><xmax>456</xmax><ymax>304</ymax></bndbox></object>
</annotation>

================================================
FILE: sample_datasets/detector/anns_validation/2007_003131.xml
================================================
<annotation>
    <folder>VOC2012</folder>
    <filename>2007_003131.jpg</filename>
    <source><database>The VOC2007 Database</database><annotation>PASCAL VOC2007</annotation><image>flickr</image></source>
    <size><width>500</width><height>334</height><depth>3</depth></size>
    <segmented>1</segmented>
    <object><name>boat</name><pose>Right</pose><truncated>0</truncated><difficult>0</difficult><bndbox><xmin>340</xmin><ymin>214</ymin><xmax>410</xmax><ymax>330</ymax></bndbox></object>
</annotation>

================================================
FILE: sample_datasets/detector/anns_validation/2007_003201.xml
================================================
<annotation>
    <folder>VOC2012</folder>
    <filename>2007_003201.jpg</filename>
    <source><database>The VOC2007 Database</database><annotation>PASCAL VOC2007</annotation><image>flickr</image></source>
    <size><width>500</width><height>315</height><depth>3</depth></size>
    <segmented>1</segmented>
    <object><name>cow</name><pose>Frontal</pose><truncated>0</truncated><difficult>0</difficult><bndbox><xmin>1</xmin><ymin>53</ymin><xmax>166</xmax><ymax>260</ymax></bndbox></object>
    <object><name>cow</name><pose>Left</pose><truncated>0</truncated><difficult>0</difficult><bndbox><xmin>137</xmin><ymin>25</ymin><xmax>416</xmax><ymax>298</ymax></bndbox></object>
    <object><name>cow</name><pose>Unspecified</pose><truncated>1</truncated><difficult>0</difficult><bndbox><xmin>320</xmin><ymin>30</ymin><xmax>500</xmax><ymax>261</ymax></bndbox></object>
</annotation>

================================================
FILE: sample_datasets/detector/anns_validation/2007_003593.xml
================================================
<annotation>
    <folder>VOC2012</folder>
    <filename>2007_003593.jpg</filename>
    <source><database>The VOC2007 Database</database><annotation>PASCAL VOC2007</annotation><image>flickr</image></source>
    <size><width>500</width><height>333</height><depth>3</depth></size>
    <segmented>1</segmented>
    <object><name>sheep</name><pose>Left</pose><truncated>1</truncated><difficult>0</difficult><bndbox><xmin>316</xmin><ymin>135</ymin><xmax>463</xmax><ymax>265</ymax></bndbox></object>
    <object><name>sheep</name><pose>Left</pose><truncated>1</truncated><difficult>0</difficult><bndbox><xmin>62</xmin><ymin>119</ymin><xmax>314</xmax><ymax>303</ymax></bndbox></object>
</annotation>

================================================
FILE: sample_datasets/detector/anns_validation/2007_004627.xml
================================================
<annotation>
    <folder>VOC2012</folder>
    <filename>2007_004627.jpg</filename>
    <source><database>The VOC2007 Database</database><annotation>PASCAL VOC2007</annotation><image>flickr</image></source>
    <size><width>500</width><height>375</height><depth>3</depth></size>
    <segmented>1</segmented>
    <object><name>train</name><pose>Unspecified</pose><truncated>0</truncated><difficult>0</difficult><bndbox><xmin>193</xmin><ymin>202</ymin><xmax>421</xmax><ymax>272</ymax></bndbox></object>
    <object><name>train</name><pose>Unspecified</pose><truncated>1</truncated><difficult>0</difficult><bndbox><xmin>417</xmin><ymin>227</ymin><xmax>500</xmax><ymax>284</ymax></bndbox></object>
</annotation>

================================================
FILE: sample_datasets/detector/anns_validation/2007_005803.xml
================================================
<annotation>
    <folder>VOC2012</folder>
    <filename>2007_005803.jpg</filename>
    <source><database>The VOC2007 Database</database><annotation>PASCAL VOC2007</annotation><image>flickr</image></source>
    <size><width>500</width><height>375</height><depth>3</depth></size>
    <segmented>1</segmented>
    <object><name>diningtable</name><pose>Unspecified</pose><truncated>0</truncated><difficult>0</difficult><bndbox><xmin>67</xmin><ymin>156</ymin><xmax>433</xmax><ymax>273</ymax></bndbox></object>
</annotation>

================================================
FILE: setup.py
================================================
from setuptools import setup, find_packages
from os import path

this_directory = path.abspath(path.dirname(__file__))
with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f:
    long_description = f.read()

with open('requirements.txt') as f:
    requirements = f.read().splitlines()

setup(name='axelerate',
      version="0.7.6",
      description='Keras-based framework for AI on the Edge',
      install_requires=requirements,
      long_description=long_description,
      long_description_content_type="text/markdown",
      author='Dmitry Maslov',
      author_email='dmitrywat@gmail.com',
      url='https://github.com/AIWintermuteAI',
      packages=find_packages())

================================================
FILE: tests_training_and_inference.py
================================================
import argparse
import json
from axelerate import setup_training, setup_evaluation
import tensorflow.keras.backend as K
from termcolor import colored
import traceback
import time


def configs(network_type):
    classifier = {
        "model": {
            "type": "Classifier",
            "architecture": "Tiny Yolo",
            "input_size": [224, 224],
            "fully-connected": [],
            "labels": [],
            "dropout": 0.5
        },
        "weights": {
            "full": "",
            "backend": None,
            "save_bottleneck": True
        },
        "train": {
            "actual_epoch": 5,
            "train_image_folder": "sample_datasets/classifier/imgs",
            "train_times": 1,
            "valid_image_folder": "sample_datasets/classifier/imgs_validation",
            "valid_times": 1,
            "valid_metric": "accuracy",
            "batch_size": 2,
            "learning_rate": 1e-4,
            "saved_folder": "classifier",
            "first_trainable_layer": "",
            "augmentation": True
        },
        "converter": {
            "type": []
        }
    }

    detector = {
        "model": {
            "type": "Detector",
            "architecture": "MobileNet7_5",
            "input_size": [240, 320],
            "anchors": [[[0.51424575, 0.54116074], [0.29523918, 0.45838044], [0.21371929, 0.21518053]]],
            "labels": ["aeroplane", "person", "diningtable", "bottle", "bird", "bus", "boat", "cow", "sheep", "train"],
            "obj_thresh": 0.7,
            "iou_thresh": 0.3,
            "coord_scale": 0.5,
            "object_scale": 5.0,
            "no_object_scale": 0.5
        },
        "weights": {
            "full": "",
            "backend": None
        },
        "train": {
            "actual_epoch": 5,
            "train_image_folder": "sample_datasets/detector/imgs",
            "train_annot_folder": "sample_datasets/detector/anns",
            "train_times": 1,
            "valid_image_folder": "sample_datasets/detector/imgs_validation",
            "valid_annot_folder": "sample_datasets/detector/anns_validation",
            "valid_times": 1,
            "valid_metric": "recall",
            "batch_size": 2,
            "learning_rate": 1e-4,
            "saved_folder": "detector",
            "first_trainable_layer": "",
            "augmentation": True,
            "is_only_detect": False
        },
        "converter": {
            "type": []
        }
    }

    segnet = {
        "model": {
            "type": "SegNet",
            "architecture": "MobileNet5_0",
            "input_size": [224, 224],
            "n_classes": 20
        },
        "weights": {
            "full": "",
            "backend": None
        },
        "train": {
            "actual_epoch": 5,
            "train_image_folder": "sample_datasets/segmentation/imgs",
            "train_annot_folder": "sample_datasets/segmentation/anns",
            "train_times": 4,
            "valid_image_folder": "sample_datasets/segmentation/imgs_validation",
"valid_annot_folder": "sample_datasets/segmentation/anns_validation", "valid_times": 4, "valid_metric": "loss", "batch_size": 2, "learning_rate": 1e-4, "saved_folder": "segment", "first_trainable_layer": "", "ignore_zero_class": False, "augmentation": True }, "converter" : { "type": [] } } dict = {'all':[classifier,detector,segnet],'classifier':[classifier],'detector':[detector],'segnet':[segnet]} return dict[network_type] argparser = argparse.ArgumentParser(description='Test axelerate on sample datasets') argparser.add_argument( '-t', '--type', default="all", help='type of network to test:classifier,detector,segnet or all') argparser.add_argument( '-a', '--arch', type=bool, default=False, help='test all architectures?') argparser.add_argument( '-c', '--conv', type=bool, default=False, help='test all converters?') args = argparser.parse_args() archs = ['MobileNet7_5'] converters = [""] errors = [] if args.arch: archs = ['Full Yolo', 'Tiny Yolo', 'MobileNet1_0', 'MobileNet7_5', 'MobileNet5_0', 'MobileNet2_5', 'SqueezeNet', 'NASNetMobile', 'ResNet50', 'DenseNet121'] if args.conv: converters = ['k210', 'tflite_fullint', 'tflite_dynamic', 'edgetpu', 'openvino', 'onnx'] for item in configs(args.type): for arch in archs: for converter in converters: try: item['model']['architecture'] = arch item['converter']['type'] = converter print(json.dumps(item, indent=4, sort_keys=False)) model_path = setup_training(config_dict=item) K.clear_session() setup_evaluation(item, model_path) except Exception as e: traceback.print_exc() print(colored(str(e), 'red')) time.sleep(2) errors.append(item['model']['type'] + " " + arch + " " + converter + " " + str(e)) for error in errors: print(error)