Repository: NVIDIA/tensorrt-laboratory Branch: v2 Commit: 33b6fdf2935c Files: 406 Total size: 2.2 MB Directory structure: gitextract_8sd61058/ ├── .bazelrc ├── .clang-format ├── .dockerignore ├── .gitmodules ├── BUILD.bazel ├── CLA ├── CMakeLists.txt ├── CREDITS.md ├── Dockerfile ├── LICENSE ├── README.md ├── WORKSPACE ├── bazel/ │ ├── BUILD.bazel │ ├── cuda_configure.bzl │ ├── repositories.bzl │ └── tensorrt_configure.bzl ├── build.sh ├── cmake/ │ ├── FindTensorRT.cmake │ ├── Findcpuaff.cmake │ ├── GRPCGenerateCPP.cmake │ ├── GRPCGenerateCPPLikeBazel.cmake │ ├── LibFindMacros.cmake │ ├── ProtobufGenerateCPPLikeBazel.cmake │ └── dependencies.cmake ├── devel.sh ├── examples/ │ ├── 00_TensorRT/ │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── infer.cc │ │ └── inference.cc │ ├── 01_Basic_GRPC/ │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── src/ │ │ ├── async_client.cc │ │ ├── client.cpp │ │ └── server.cpp │ ├── 02_TensorRT_GRPC/ │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── src/ │ │ ├── async-client.cc │ │ ├── metrics.cc │ │ ├── metrics.h │ │ ├── server.cc │ │ ├── siege.cc │ │ └── sync-client.cc │ ├── 03_Batching/ │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── inference-batcher.cc │ │ ├── launch_batching.sh │ │ ├── simple_batching_client.py │ │ ├── simple_pb2.py │ │ ├── simple_pb2_grpc.py │ │ ├── streaming-service.cc │ │ └── unary_client.py │ ├── 04_Middleman/ │ │ ├── CMakeLists.txt │ │ └── middleman-client.cc │ ├── 10_Internals/ │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ └── internals.cc │ ├── 11_Protos/ │ │ ├── CMakeLists.txt │ │ ├── demo/ │ │ │ ├── CMakeLists.txt │ │ │ ├── dataset.proto │ │ │ └── inference.proto │ │ ├── echo/ │ │ │ ├── CMakeLists.txt │ │ │ └── echo.proto │ │ └── inference/ │ │ ├── CMakeLists.txt │ │ ├── api.proto │ │ ├── model_config.proto │ │ ├── nvidia_inference.proto │ │ ├── request_status.proto │ │ └── server_status.proto │ ├── 12_ConfigGenerator/ │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── generator.cc │ │ ├── link.sh │ │ └── ms_mgmt │ ├── 12_FlatBuffers/ │ │ ├── CMakeLists.txt │ │ ├── client.cc │ │ ├── example.fbs │ │ ├── example.grpc.fb.cc │ │ ├── example.grpc.fb.h │ │ ├── example_generated.h │ │ └── server.cc │ ├── 30_PyTensorRT/ │ │ ├── README.md │ │ ├── client.py │ │ ├── compute.py │ │ ├── infer_test_utils.py │ │ ├── rebuild.sh │ │ └── server.py │ ├── 90_Kubernetes/ │ │ ├── README.md │ │ ├── bootstrap-minikube.sh │ │ ├── deploy/ │ │ │ └── build-and-run.sh │ │ ├── devel/ │ │ │ ├── README.md │ │ │ └── yais-devel.yml │ │ ├── istio/ │ │ │ ├── README.md │ │ │ └── rendered/ │ │ │ ├── istio-v0.8-minikube.yml │ │ │ └── istio-v1.0-minikube.yml │ │ ├── minikube/ │ │ │ ├── README.md │ │ │ └── bootstrap.sh │ │ ├── prometheus/ │ │ │ ├── bootstrap.sh │ │ │ ├── custom-settings.yml │ │ │ ├── service-account.yml │ │ │ ├── yais-dashboard.json │ │ │ └── yais-metrics.yml │ │ └── yais-deploy.yml │ ├── 91_Prometheus/ │ │ ├── README.md │ │ └── scrape.conf │ ├── 97_SingleProcessMultiSteam/ │ │ └── launch_service.sh │ ├── 98_MultiProcessSingleStream/ │ │ ├── README.md │ │ ├── run_latency_test │ │ ├── run_throughput_test │ │ └── setup.py │ ├── 99_LoadBalancer/ │ │ ├── README.md │ │ ├── lb-envoy.j2 │ │ └── run_loadbalancer.py │ ├── CMakeLists.txt │ ├── Deployment/ │ │ ├── CMakeLists.txt │ │ ├── ImageClient/ │ │ │ ├── CMakeLists.txt │ │ │ ├── api.proto │ │ │ ├── client.cc │ │ │ ├── client.h │ │ │ └── client.py │ │ ├── Kubernetes/ │ │ │ └── basic-trtis-deployment/ │ │ │ ├── deploy.yml │ │ │ ├── istio-ingress.yml │ │ │ └── scrape-metrics.yml │ │ ├── ObjectStore/ │ │ │ ├── 
README.md │ │ │ ├── create_buckets.py │ │ │ ├── get_rook_s3_keys.sh │ │ │ ├── ingress-istio.yml │ │ │ ├── ingress-nginx.yml │ │ │ └── rook-s3.yml │ │ ├── README.md │ │ ├── RouteRequests/ │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── envoy_config.yaml │ │ │ ├── test_client.py │ │ │ ├── test_routing.sh │ │ │ └── test_service.cc │ │ └── batcher.cc │ ├── ONNX/ │ │ └── resnet50/ │ │ ├── README.md │ │ ├── build.py │ │ ├── calibration_images.csv │ │ ├── calibrator.py │ │ ├── fetch.sh │ │ ├── imagenet_labels.py │ │ ├── int8.py │ │ ├── onnx_utils.py │ │ ├── open_source_images.md5 │ │ ├── resnet50.md5 │ │ ├── run_jpeg_test.py │ │ └── run_onnx_tests.py │ └── nvRPC/ │ ├── CMakeLists.txt │ ├── SharedMemoryService/ │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── client.cc │ │ └── server.cc │ ├── StreamingInOrderSendRecv/ │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── client.cc │ │ ├── server.cc │ │ └── test.sh │ ├── StreamingService/ │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── client.cc │ │ ├── common.h │ │ ├── even-odds.cc │ │ ├── ping-pong.cc │ │ └── test.sh │ └── UnaryService/ │ ├── CMakeLists.txt │ ├── client.cc │ └── server.cc ├── jupyter_notebook_config.py ├── models/ │ ├── README.md │ ├── ResNet-152-deploy.prototxt │ ├── ResNet-50-deploy.prototxt │ ├── mps_builder │ ├── onnx/ │ │ ├── common.py │ │ ├── mnist-v1.3/ │ │ │ ├── model.onnx │ │ │ ├── test_data_set_0/ │ │ │ │ ├── input_0.pb │ │ │ │ └── output_0.pb │ │ │ ├── test_data_set_1/ │ │ │ │ ├── input_0.pb │ │ │ │ └── output_0.pb │ │ │ └── test_data_set_2/ │ │ │ ├── input_0.pb │ │ │ └── output_0.pb │ │ └── onnx_builder.py │ └── setup.py ├── notebooks/ │ ├── Demo Day 1.ipynb │ ├── Demo Day 2.ipynb │ ├── Demo Day 3.ipynb │ ├── Multiple Models.ipynb │ ├── Quickstart.ipynb │ └── README.md ├── requirements.txt └── trtlab/ ├── BUILD.bazel ├── CMakeLists.txt ├── core/ │ ├── BUILD.bazel │ ├── CMakeLists.txt │ ├── benchmarks/ │ │ ├── CMakeLists.txt │ │ ├── bench_batcher.cc │ │ ├── bench_memory.cc │ │ ├── bench_memory_stack.cc │ │ ├── bench_pool.cc │ │ ├── bench_thread_pool.cc │ │ └── main.cc │ ├── include/ │ │ └── trtlab/ │ │ └── core/ │ │ ├── affinity.h │ │ ├── async_compute.h │ │ ├── batcher.h │ │ ├── cyclic_buffer.h │ │ ├── cyclic_windowed_buffer.h │ │ ├── dispatcher.h │ │ ├── fiber_group.h │ │ ├── hybrid_condition.h │ │ ├── hybrid_mutex.h │ │ ├── memory/ │ │ │ └── first_touch_allocator.h │ │ ├── pool.h │ │ ├── ranges.h │ │ ├── resources.h │ │ ├── standard_threads.h │ │ ├── task_pool.h │ │ ├── thread_pool.h │ │ ├── types.h │ │ ├── userspace_threads.h │ │ └── utils.h │ ├── src/ │ │ ├── affinity.cc │ │ ├── cyclic_buffer.cc │ │ ├── cyclic_windowed_buffer.cc │ │ ├── memory/ │ │ │ ├── copy.cc │ │ │ ├── host_memory.cc │ │ │ ├── malloc.cc │ │ │ ├── memory.cc │ │ │ ├── sysv_allocator.cc │ │ │ └── tensor_shape.cc │ │ ├── types.cc │ │ └── utils.cc │ └── tests/ │ ├── BUILD.bazel │ ├── CMakeLists.txt │ ├── test_affinity.cc │ ├── test_async.cc │ ├── test_async_compute.cc │ ├── test_batcher.cc │ ├── test_common.cc │ ├── test_common.h │ ├── test_cyclic_allocator.cc │ ├── test_cyclic_windowed_buffer.cc │ ├── test_foo_memory.cc │ ├── test_main.cc │ ├── test_memory.cc │ ├── test_memory_old.cc │ ├── test_memory_stack.cc │ ├── test_pool.cc │ ├── test_stl_allocator.cc │ ├── test_sysv_allocator.cc │ ├── test_tensor.cc │ ├── test_thread_pool.cc │ ├── test_transactional_allocator.h │ └── test_types.cc ├── cuda/ │ ├── BUILD.bazel │ ├── CMakeLists.txt │ ├── benchmarks/ │ │ ├── CMakeLists.txt │ │ ├── bench_cuda_memory.cc │ │ └── bench_main.cc │ ├── include/ │ │ 
└── trtlab/ │ │ └── cuda/ │ │ ├── common.h │ │ ├── cyclic_windowed_buffer.h │ │ ├── device_guard.h │ │ ├── device_info.h │ │ ├── memory/ │ │ │ ├── cuda_allocators.h │ │ │ └── device_memory.h │ │ └── sync.h │ ├── src/ │ │ ├── copy.cc │ │ ├── cuda_allocators.cc │ │ ├── device_guard.cc │ │ └── device_info.cc │ └── tests/ │ ├── CMakeLists.txt │ ├── test_allocators.cc │ ├── test_device_info.cc │ ├── test_main.cc │ └── test_memory.cc ├── memory/ │ ├── CMakeLists.txt │ ├── benchmarks/ │ │ ├── CMakeLists.txt │ │ ├── bench_memory.cc │ │ ├── bench_memory_pool.cc │ │ └── main.cc │ ├── cmake/ │ │ ├── configuration.cmake │ │ └── dependencies.cmake │ ├── include/ │ │ └── trtlab/ │ │ └── memory/ │ │ ├── align.h │ │ ├── allocator.h │ │ ├── allocator_storage.h │ │ ├── allocator_traits.h │ │ ├── bfit_allocator.h │ │ ├── block_allocators.h │ │ ├── block_arena.h │ │ ├── block_manager.h │ │ ├── block_stack.h │ │ ├── config.h │ │ ├── debugging.h │ │ ├── deleter.h │ │ ├── descriptor.h │ │ ├── detail/ │ │ │ ├── assert.h │ │ │ ├── block_list.h │ │ │ ├── container_node_sizes.h │ │ │ ├── debug_helpers.h │ │ │ ├── free_list.h │ │ │ ├── memory_stack.h │ │ │ ├── page_info.h │ │ │ ├── ranges.h │ │ │ └── utility.h │ │ ├── error.h │ │ ├── huge_page_allocator.h │ │ ├── literals.h │ │ ├── malloc_allocator.h │ │ ├── memory_block.h │ │ ├── memory_pool.h │ │ ├── memory_resource.h │ │ ├── memory_type.h │ │ ├── memory_typed_allocator.h │ │ ├── posix_aligned_allocator.h │ │ ├── raii_allocator.h │ │ ├── smart_ptr.h │ │ ├── std_allocator.h │ │ ├── threading.h │ │ ├── trackers.h │ │ ├── tracking.h │ │ ├── transactional_allocator.h │ │ └── utils.h │ ├── src/ │ │ ├── CMakeLists.txt │ │ ├── align.cc │ │ ├── block_stack.cc │ │ ├── config.h.in │ │ ├── descriptor.cc │ │ ├── detail/ │ │ │ ├── block_list.cc │ │ │ ├── free_list.cc │ │ │ ├── free_list_utils.h │ │ │ └── page_info.c │ │ ├── error.cc │ │ ├── ilog2.h │ │ ├── memory_type.cc │ │ ├── trackers.cc │ │ └── utils.cc │ ├── tests/ │ │ ├── CMakeLists.txt │ │ ├── test_main.cc │ │ └── test_memory.cc │ └── tools/ │ ├── CMakeLists.txt │ ├── node_size_debugger.cpp │ ├── node_size_debugger.hpp │ └── test_types.hpp ├── nvrpc/ │ ├── BUILD.bazel │ ├── CMakeLists.txt │ ├── include/ │ │ └── nvrpc/ │ │ ├── client/ │ │ │ ├── base_context.h │ │ │ ├── client_single_up_multiple_down.h │ │ │ ├── client_streaming.h │ │ │ ├── client_streaming_v2.h │ │ │ ├── client_streaming_v3.h │ │ │ ├── client_unary.h │ │ │ ├── client_unary_v2.h │ │ │ └── executor.h │ │ ├── context.h │ │ ├── executor.h │ │ ├── fiber/ │ │ │ └── executor.h │ │ ├── interfaces.h │ │ ├── life_cycle_batching.h │ │ ├── life_cycle_bidirectional.h │ │ ├── life_cycle_streaming.h │ │ ├── life_cycle_unary.h │ │ ├── rpc.h │ │ ├── server.h │ │ └── service.h │ ├── src/ │ │ ├── client/ │ │ │ └── client_executor.cc │ │ ├── executor.cc │ │ └── server.cc │ └── tests/ │ ├── CMakeLists.txt │ ├── test_build_client.h │ ├── test_build_server.h │ ├── test_pingpong.cc │ ├── test_pingpong.h │ ├── test_resources.cc │ ├── test_resources.h │ ├── test_server.cc │ └── testing.proto ├── pybind/ │ ├── CMakeLists.txt │ └── trtlab/ │ ├── CMakeLists.txt │ ├── infer.cc │ ├── utils.cc │ └── utils.h └── tensorrt/ ├── BUILD.bazel ├── CMakeLists.txt ├── include/ │ └── trtlab/ │ └── tensorrt/ │ ├── allocator.h │ ├── bindings.h │ ├── buffers.h │ ├── common.h │ ├── execution_context.h │ ├── infer_bench.h │ ├── infer_runner.h │ ├── inference_manager.h │ ├── model.h │ ├── runtime.h │ ├── utils.h │ └── workspace.h ├── src/ │ ├── allocator.cc │ ├── bindings.cc │ ├── buffers.cc │ ├── 
execution_context.cc │ ├── infer_bench.cc │ ├── inference_manager.cc │ ├── model.cc │ ├── runtime.cc │ ├── utils.cc │ └── workspace.cc └── tests/ ├── CMakeLists.txt └── test_buffers.cc ================================================ FILE CONTENTS ================================================ ================================================ FILE: .bazelrc ================================================ build --cxxopt=-std=c++1z build --incompatible_remove_native_http_archive=false build --incompatible_package_name_is_a_function=false ================================================ FILE: .clang-format ================================================ #BasedOnStyle: Google Language: Cpp # BasedOnStyle: LLVM AccessModifierOffset: -2 AlignAfterOpenBracket: Align AlignConsecutiveAssignments: false AlignConsecutiveDeclarations: false AlignEscapedNewlinesLeft: true AlignOperands: true AlignTrailingComments: false AllowAllParametersOfDeclarationOnNextLine: true AllowShortBlocksOnASingleLine: false AllowShortCaseLabelsOnASingleLine: false AllowShortIfStatementsOnASingleLine: true AllowShortFunctionsOnASingleLine: true AllowShortLoopsOnASingleLine: true AlwaysBreakAfterDefinitionReturnType: None AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: false AlwaysBreakTemplateDeclarations: true BinPackArguments: true BinPackParameters: true BraceWrapping: AfterClass: true AfterControlStatement: true AfterEnum: true AfterFunction: true AfterNamespace: false AfterObjCDeclaration: false AfterStruct: true AfterUnion: true BeforeCatch: true BeforeElse: true IndentBraces: false BreakBeforeBinaryOperators: None BreakBeforeBraces: Custom BreakBeforeTernaryOperators: true BreakConstructorInitializersBeforeComma: false BreakAfterJavaFieldAnnotations: false BreakStringLiterals: true ColumnLimit: 100 CommentPragmas: '^ IWYU pragma:' ConstructorInitializerAllOnOneLineOrOnePerLine: false ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true DerivePointerAlignment: false DisableFormat: false ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] IncludeCategories: - Regex: '^"(llvm|llvm-c|clang|clang-c)/' Priority: 2 - Regex: '^(<|"(gtest|isl|json)/)' Priority: 3 - Regex: '.*' Priority: 1 IncludeIsMainRegex: '$' IndentCaseLabels: true IndentWidth: 4 IndentWrappedFunctionNames: true JavaScriptQuotes: Leave KeepEmptyLinesAtTheStartOfBlocks: false MacroBlockBegin: '' MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 NamespaceIndentation: None PenaltyBreakBeforeFirstCallParameter: 19 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 100 PenaltyBreakString: 1000 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 60 PointerAlignment: Left ReflowComments: true SortIncludes: true SpaceAfterCStyleCast: false SpaceAfterTemplateKeyword: false SpaceBeforeAssignmentOperators: true SpaceBeforeParens: Never SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 SpacesInAngles: false SpacesInContainerLiterals: false SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false Standard: Cpp11 TabWidth: 4 UseTab: Never ================================================ FILE: .dockerignore ================================================ build *.engine models @eaDir __pycache__ bazel-* ================================================ FILE: .gitmodules ================================================ [submodule "third_party/cpuaff"] path = third_party/cpuaff url = https://github.com/dcdillon/cpuaff [submodule "third_party/gflags"] path = 
third_party/gflags url = https://github.com/gflags/gflags.git [submodule "third_party/glog"] path = third_party/glog url = https://github.com/google/glog.git [submodule "third_party/grpc"] path = third_party/grpc url = https://github.com/grpc/grpc [submodule "third_party/wait-for-it"] path = third_party/wait-for-it url = https://github.com/vishnubob/wait-for-it [submodule "third_party/benchmark"] path = third_party/benchmark url = https://github.com/google/benchmark.git [submodule "third_party/googletest"] path = third_party/googletest url = https://github.com/google/googletest.git [submodule "third_party/pybind11"] path = third_party/pybind11 url = https://github.com/pybind/pybind11.git [submodule "third_party/flatbuffers"] path = third_party/flatbuffers url = https://github.com/google/flatbuffers.git ================================================ FILE: BUILD.bazel ================================================ package(default_visibility = ["//visibility:public"]) ================================================ FILE: CLA ================================================ The NVIDIA TensorRT Laboratory Software Grant and Corporate Contributor License Agreement ("Agreement") Thank you for your interest in the NVIDIA TensorRT Laboratory Project (the "Project"). In order to clarify the intellectual property license granted with Contributions from any person or entity, NVIDIA Corporation (the “Copyright Holders") must have a Contributor License Agreement (CLA) on file that has been signed by each Contributor, indicating agreement to the license terms below. This license is for your protection as a Contributor as well as the protection of the Project and its users; it does not change your rights to use your own Contributions for any other purpose. This version of the Agreement allows an entity (the "Corporation") to submit Contributions to the Project, to authorize Contributions submitted by its designated employees to the Project, and to grant copyright and patent licenses thereto to the Copyright Holders. If you have not already done so, please complete and sign, then scan and email a pdf file of this Agreement to rolson@nvidia.com. Please read this document carefully before signing and keep a copy for your records. Corporation name: ________________________________________________ Corporation address: ________________________________________________ ________________________________________________ ________________________________________________ Point of Contact: ________________________________________________ E-Mail: ________________________________________________ Telephone: _____________________ Fax: _____________________ You accept and agree to the following terms and conditions for Your present and future Contributions submitted to the Project. In return, the Copyright Holders shall not use Your Contributions in a way that is contrary to the public benefit or inconsistent with its nonprofit status and bylaws in effect at the time of the Contribution. Except for the license granted herein to the Copyright Holders and recipients of software distributed by the Copyright Holders, You reserve all right, title, and interest in and to Your Contributions. 1. Definitions. "You" (or "Your") shall mean the copyright owner or legal entity authorized by the copyright owner that is making this Agreement with the Copyright Holders. 
For legal entities, the entity making a Contribution and all other entities that control, are controlled by, or are under common control with that entity are considered to be a single Contributor. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "Contribution" shall mean the code, documentation or other original works of authorship expressly identified in Schedule B, as well as any original work of authorship, including any modifications or additions to an existing work, that is intentionally submitted by You to the Copyright Holders for inclusion in, or documentation of, any of the products owned or managed by the Copyright Holders (the "Work"). For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Copyright Holders or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Copyright Holders for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by You as "Not a Contribution." 2. Grant of Copyright License. Subject to the terms and conditions of this Agreement, You hereby grant to the Copyright Holders and to recipients of software distributed by the Copyright Holders a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare derivative works of, publicly display, publicly perform, sublicense, and distribute Your Contributions and such derivative works. 3. Grant of Patent License. Subject to the terms and conditions of this Agreement, You hereby grant to the Copyright Holders and to recipients of software distributed by the Copyright Holders a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by You that are necessarily infringed by Your Contribution(s) alone or by combination of Your Contribution(s) with the Work to which such Contribution(s) were submitted. If any entity institutes patent litigation against You or any other entity (including a cross-claim or counterclaim in a lawsuit) alleging that your Contribution, or the Work to which you have contributed, constitutes direct or contributory patent infringement, then any patent licenses granted to that entity under this Agreement for that Contribution or Work shall terminate as of the date such litigation is filed. 4. You represent that You are legally entitled to grant the above license. You represent further that each employee of the Corporation designated on Schedule A below (or in a subsequent written modification to that Schedule) is authorized to submit Contributions on behalf of the Corporation. 5. You represent that each of Your Contributions is Your original creation (see section 7 for submissions on behalf of others). 6. You are not expected to provide support for Your Contributions, except to the extent You desire to provide support. You may provide support for free, for a fee, or not at all. 
Unless required by applicable law or agreed to in writing, You provide Your Contributions on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. 7. Should You wish to submit work that is not Your original creation, You may submit it to the Copyright Holders separately from any Contribution, identifying the complete details of its source and of any license or other restriction (including, but not limited to, related patents, trademarks, and license agreements) of which you are personally aware, and conspicuously marking the work as "Submitted on behalf of a third-party: [named here]". 8. It is your responsibility to notify the Copyright Holders when any change is required to the list of designated employees authorized to submit Contributions on behalf of the Corporation, or to the Corporation's Point of Contact with the Copyright Holders. Please sign: __________________________________ Date: _______________ Title: __________________________________ Corporation: __________________________________ Schedule A [Initial list of designated employees. NB: authorization is not tied to particular Contributions.] Schedule B [Identification of optional concurrent software grant. Would be left blank or omitted if there is no concurrent software grant.] ================================================ FILE: CMakeLists.txt ================================================ # Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
cmake_minimum_required(VERSION 3.13 FATAL_ERROR)

option (BUILD_DEPENDENCIES "Whether or not a superbuild should be invoked" ON)
option (BUILD_MEMORY "Whether or not to build trtlab/memory" ON)
option (BUILD_CORE "Whether or not to build trtlab/core" ON)
option (BUILD_CUDA "Whether or not to build trtlab/cuda" ON)
option (BUILD_NVRPC "Whether or not to build trtlab/nvrpc" ON)
option (BUILD_TENSORRT "Whether or not to build trtlab/tensorrt" ON)
option (BUILD_PYTHON "Whether or not to build trtlab/pybind" OFF)
option (BUILD_EXAMPLES "Whether or not to build trtlab examples" OFF)

if (BUILD_DEPENDENCIES)
  project (trtlab_dependencies NONE)
  include (cmake/dependencies.cmake)
  return() # stop processing this file further
else()
  project (trtlab)
endif()

# CMake path
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
list(APPEND CMAKE_PREFIX_PATH "${PROJECT_SOURCE_DIR}/cmake")

# trtlab
add_subdirectory(trtlab)

if (BUILD_EXAMPLES)
  add_subdirectory(examples)
endif()


================================================
FILE: CREDITS.md
================================================
`ThreadPool` class was derived from https://github.com/progschj/ThreadPool

> Copyright (c) 2012 Jakob Progsch, Václav Zeman
>
> This software is provided 'as-is', without any express or implied
> warranty. In no event will the authors be held liable for any damages
> arising from the use of this software.
>
> Permission is granted to anyone to use this software for any purpose,
> including commercial applications, and to alter it and redistribute it
> freely, subject to the following restrictions:
>
> 1. The origin of this software must not be misrepresented; you must not
> claim that you wrote the original software. If you use this software
> in a product, an acknowledgment in the product documentation would be
> appreciated but is not required.
>
> 2. Altered source versions must be plainly marked as such, and must not be
> misrepresented as being the original software.
>
> 3. This notice may not be removed or altered from any source
> distribution.
>
> Modifications to the original work include:
> * Header-only file was split into .h/.cc files
> * Added an extra safety check (lines 30-31) in the constructor (.cc file).
> * Added CPU affinity options to the constructor

-----

`cpuaff` is distributed unmodified from the original in
[`third-party/cpuaff`](https://github.com/dcdillon/cpuaff)

> Copyright (c) 2015, Daniel C. Dillon
> All rights reserved.
>
> Redistribution and use in source and binary forms, with or without
> modification, are permitted provided that the following conditions are met:
>
> * Redistributions of source code must retain the above copyright notice, this
> list of conditions and the following disclaimer.
>
> * Redistributions in binary form must reproduce the above copyright notice,
> this list of conditions and the following disclaimer in the documentation
> and/or other materials provided with the distribution.
>
> * Neither the name of cpuaff nor the names of its
> contributors may be used to endorse or promote products derived from
> this software without specific prior written permission.
>
> THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
> AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
> DISCLAIMED.
> IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
> FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
> SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
> CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
> OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-----

`wait-for-it.sh` is distributed unmodified from the original in
[`third-party/wait-for-it`](https://github.com/vishnubob/wait-for-it).

> The MIT License (MIT)
> Copyright (c) 2016 Giles Hall
>
> Permission is hereby granted, free of charge, to any person obtaining a copy of
> this software and associated documentation files (the "Software"), to deal in
> the Software without restriction, including without limitation the rights to
> use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
> of the Software, and to permit persons to whom the Software is furnished to do
> so, subject to the following conditions:
>
> The above copyright notice and this permission notice shall be included in all
> copies or substantial portions of the Software.
>
> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
> AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> SOFTWARE.

---

Example gRPC client code was used with modification from the
[gRPC project](https://github.com/grpc/grpc), specifically the
[synchronous c++ client](https://github.com/grpc/grpc/blob/master/examples/cpp/helloworld/greeter_client.cc)

> Copyright 2015 gRPC authors.
>
> Licensed under the Apache License, Version 2.0 (the "License");
> you may not use this file except in compliance with the License.
> You may obtain a copy of the License at
>
> http://www.apache.org/licenses/LICENSE-2.0
>
> Unless required by applicable law or agreed to in writing, software
> distributed under the License is distributed on an "AS IS" BASIS,
> WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> See the License for the specific language governing permissions and
> limitations under the License.

---

[moodycamel::ConcurrentQueue](https://github.com/cameron314/concurrentqueue) is
added unmodified to the Docker images and loaded into the `playground` namespace.

> Simplified BSD License:
>
> Copyright (c) 2013-2016, Cameron Desrochers. All rights reserved.
>
> Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
>
> Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
> Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
> THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

---

`transwarp` is used unmodified from the original
[bloomen/transwarp](https://github.com/bloomen/transwarp)

> MIT License
>
> Copyright (c) 2018-2019 Christian Blume, Guan Wang
>
> Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
>
> The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
>
> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

---

Caffe ResNet-50 and ResNet-152 models from
[KaimingHe/deep-residual-networks](https://github.com/KaimingHe/deep-residual-networks)
are included without modification.

> The MIT License (MIT)
>
> Copyright (c) 2016 Shaoqing Ren
>
> Permission is hereby granted, free of charge, to any person obtaining a copy
> of this software and associated documentation files (the "Software"), to deal
> in the Software without restriction, including without limitation the rights
> to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> copies of the Software, and to permit persons to whom the Software is
> furnished to do so, subject to the following conditions:
>
> The above copyright notice and this permission notice shall be included in all
> copies or substantial portions of the Software.
>
> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
> AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> SOFTWARE.
---


================================================
FILE: Dockerfile
================================================
# stage 1 - development container
# holds the core nvidia libraries but does not contain the project source code.
# use this container for development by mapping your source into the container,
# which persists your source code outside of the container lifecycle
FROM nvcr.io/nvidia/tensorrt:20.06-py3 AS base

RUN apt update
RUN apt install -y clang-format libssl-dev openssl libz-dev software-properties-common

# remove base cmake
RUN apt remove --purge -y cmake
RUN apt autoremove -y
RUN apt autoclean -y

# install cmake ppa from kitware - https://apt.kitware.com/
RUN apt install -y apt-transport-https ca-certificates gnupg software-properties-common wget
RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
RUN apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main'
RUN apt update && apt install -y cmake

# then remove FindGTest.cmake installed by cmake
RUN find / -name "FindGTest.cmake" -exec rm -f {} \;

# add cufft and nvml to the container image
RUN apt install -y libcufft-dev-11-0 cuda-nvml-dev-11-0

# override some envs
ENV LD_LIBRARY_PATH=/externals/myelin/x86_64/cuda-11.0/lib:/externals/cudnn/x86_64/8.0/cuda-11.0/lib64:/usr/local/cuda-11.0/targets/x86_64-linux/lib
ENV CCACHE_DIR=/tmp/.ccache
RUN cd /usr/lib/x86_64-linux-gnu && ln -s libnvidia-ml.so.1 libnvidia-ml.so

# stage 2: build the project inside the dev container
FROM base AS trtlab
WORKDIR /work
COPY . .
RUN mkdir build && cd build && cmake .. && make -j


================================================
FILE: LICENSE
================================================
BSD 3-Clause License

Copyright (c) 2018-2019, NVIDIA Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


================================================
FILE: README.md
================================================
# TensorRT Laboratory

The TensorRT Laboratory (trtlab) is a general-purpose set of tools to build
custom inference applications and services.
For a professional-grade production inference server, see
[Triton](https://github.com/nvidia/triton).

This project is broken into five primary components:

* `memory`, based on [foonathan/memory](https://github.com/foonathan/memory),
  is designed for writing custom allocators for both host and GPU memory.
  Several custom allocators are included.

* `core` contains host/cpu-side tools for common components such as thread
  pools, resource pools, and userspace threading based on Boost fibers.

* `cuda` extends `memory` with a new memory_type for CUDA device memory. All
  custom allocators in `memory` can be used with `device_memory`,
  `device_managed_memory`, or `host_pinned_memory`.

* `nvrpc` is an abstraction layer for building asynchronous microservices.
  The current implementation is based on gRPC.

* `tensorrt` provides an opinionated runtime built on the TensorRT API.

## Quickstart

The easiest way to manage the external NVIDIA dependencies is to leverage the
containers hosted on [NGC](https://ngc.nvidia.com). For bare-metal installs,
use the `Dockerfile` as a template for which NVIDIA libraries to install.

```
docker build -t trtlab .
```

For development purposes, the following set of commands first builds the base
image, then maps the source code on the host into a running container.

```
docker build -t trtlab:dev --target base .
docker run --rm -ti --gpus=all -v $PWD:/work --workdir=/work --net=host trtlab:dev bash
```

## Copyright and License

This project is released under the [BSD 3-clause license](LICENSE).

## Issues and Contributing

* Please report problems or request features by [filing a new issue](https://github.com/NVIDIA/tensorrt-laboratory/issues/new)
* You can contribute by opening a [pull request](https://help.github.com/articles/using-pull-requests/)

Pull requests with changes of 10 lines or more will require a
[Contributor License Agreement](CLA).


================================================
FILE: WORKSPACE
================================================
workspace(name = "com_github_nvidia_trtlab")

load(":bazel/repositories.bzl", "repositories")
repositories()

load ("//bazel:cuda_configure.bzl", "cuda_configure")
cuda_configure(name = "local_config_cuda")

load ("//bazel:tensorrt_configure.bzl", "tensorrt_configure")
tensorrt_configure(name = "local_config_tensorrt")

load("@com_github_grpc_grpc//bazel:grpc_deps.bzl", "grpc_deps")
grpc_deps()


================================================
FILE: bazel/BUILD.bazel
================================================
exports_files(
    glob(["*.bzl"]),
    visibility = ["//visibility:public"],
)


================================================
FILE: bazel/cuda_configure.bzl
================================================
"""Build rule generator for locally installed CUDA toolkit and cuDNN SDK."""

# src: https://github.com/google/nvidia_libs_test

def _get_env_var(repository_ctx, name, default):
    if name in repository_ctx.os.environ:
        return repository_ctx.os.environ[name]
    return default

def _impl(repository_ctx):
    cuda_path = _get_env_var(repository_ctx, "CUDA_PATH", "/usr/local/cuda")
    cudnn_path = _get_env_var(repository_ctx, "CUDNN_PATH", cuda_path)
    print("Using CUDA from %s\n" % cuda_path)
    print("Using cuDNN from %s\n" % cudnn_path)
    repository_ctx.symlink(cuda_path, "cuda")
    repository_ctx.symlink(cudnn_path, "cudnn")
    repository_ctx.file("nvcc.sh", """
#!/bin/bash
repo_path=%s
compiler=${CC:+"--compiler-bindir=$CC"}
$repo_path/cuda/bin/nvcc $compiler --compiler-options=-fPIC --include-path=$repo_path $*
""" % repository_ctx.path("."))
    repository_ctx.file("BUILD", """
package(default_visibility = ["//visibility:public"])

sh_binary(
    name = "nvcc",
    srcs = ["nvcc.sh"],
)

# The *_headers cc_library rules below aren't cc_inc_library rules because
# dependent targets would only see the first one.
cc_library(
    name = "cuda_headers",
    hdrs = glob(
        include = ["cuda/include/**/*.h*"],
        exclude = ["cuda/include/cudnn.h"]
    ),
    # Allows including CUDA headers with angle brackets.
    includes = ["cuda/include"],
)

cc_library(
    name = "cuda",
    srcs = ["cuda/lib64/stubs/libcuda.so"],
    linkopts = ["-ldl"],
)

cc_library(
    name = "cuda_runtime",
    srcs = ["cuda/lib64/libcudart_static.a"],
    deps = [":cuda"],
    linkopts = ["-lrt"],
)

cc_library(
    name = "curand_static",
    srcs = [
        "cuda/lib64/libcurand_static.a",
        "cuda/lib64/libculibos.a",
    ],
)

cc_library(
    name = "cupti_headers",
    hdrs = glob(["cuda/extras/CUPTI/include/**/*.h"]),
    # Allows including CUPTI headers with angle brackets.
    includes = ["cuda/extras/CUPTI/include"],
)

cc_library(
    name = "cupti",
    srcs = glob(["cuda/extras/CUPTI/lib64/libcupti.so*"]),
)

cc_library(
    name = "cudnn",
    srcs = [
        "cudnn/lib64/libcudnn_static.a",
        "cuda/lib64/libcublas_static.a",
        "cuda/lib64/libculibos.a",
    ],
    hdrs = ["cudnn/include/cudnn.h"],
    deps = [
        ":cuda",
        ":cuda_headers"
    ],
)

cc_library(
    name = "cuda_util",
    deps = [":cuda_util_compile"],
)
""")

cuda_configure = repository_rule(
    implementation = _impl,
    environ = ["CUDA_PATH", "CUDNN_PATH"],
)


================================================
FILE: bazel/repositories.bzl
================================================
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")

def repositories():
    _maybe(
        http_archive,
        name = "com_github_antonovvk_bazel_rules",
        sha256 = "ba75b07d3fd297375a6688e9a16583eb616e7a74b3d5e8791e7a222cf36ab26e",
        strip_prefix = "bazel_rules-98ddd7e4f7c63ea0868f08bcc228463dac2f9f12",
        urls = [
            "https://mirror.bazel.build/github.com/antonovvk/bazel_rules/archive/98ddd7e4f7c63ea0868f08bcc228463dac2f9f12.tar.gz",
            "https://github.com/antonovvk/bazel_rules/archive/98ddd7e4f7c63ea0868f08bcc228463dac2f9f12.tar.gz",
        ],
    )

    _maybe(
        http_archive,
        name = "com_github_gflags_gflags",
        sha256 = "6e16c8bc91b1310a44f3965e616383dbda48f83e8c1eaa2370a215057b00cabe",
        strip_prefix = "gflags-77592648e3f3be87d6c7123eb81cbad75f9aef5a",
        urls = [
            "https://mirror.bazel.build/github.com/gflags/gflags/archive/77592648e3f3be87d6c7123eb81cbad75f9aef5a.tar.gz",
            "https://github.com/gflags/gflags/archive/77592648e3f3be87d6c7123eb81cbad75f9aef5a.tar.gz",
        ],
    )

    _maybe(
        http_archive,
        name = "com_google_glog",
        sha256 = "1ee310e5d0a19b9d584a855000434bb724aa744745d5b8ab1855c85bff8a8e21",
        strip_prefix = "glog-028d37889a1e80e8a07da1b8945ac706259e5fd8",
        urls = [
            "https://mirror.bazel.build/github.com/google/glog/archive/028d37889a1e80e8a07da1b8945ac706259e5fd8.tar.gz",
            "https://github.com/google/glog/archive/028d37889a1e80e8a07da1b8945ac706259e5fd8.tar.gz",
        ],
    )

    _maybe(
        http_archive,
        name = "com_google_googletest",
        sha256 = "c18f281fd6621bb264570b99860a0241939b4a251c9b1af709b811d33bc63af8",
        strip_prefix = "googletest-e3bd4cbeaeef3cee65a68a8bd3c535cb779e9b6d",
        urls = [
            "https://mirror.bazel.build/github.com/google/googletest/archive/e3bd4cbeaeef3cee65a68a8bd3c535cb779e9b6d.tar.gz",
            "https://github.com/google/googletest/archive/e3bd4cbeaeef3cee65a68a8bd3c535cb779e9b6d.tar.gz",
        ],
    )
"com_github_grpc_grpc", strip_prefix = "grpc-1.16.1", urls = [ "https://github.com/grpc/grpc/archive/v1.16.1.tar.gz", ], ) def load_trtis(): http_archive( name = "com_github_nvidia_trtis", strip_prefix = "tensorrt-inference-server-0.9.0", urls = [ "https://github.com/NVIDIA/tensorrt-inference-server/archive/v0.9.0.tar.gz", ], ) def load_benchmark(): http_archive( name = "com_github_google_benchmark", sha256 = "f8e525db3c42efc9c7f3bc5176a8fa893a9a9920bbd08cef30fb56a51854d60d", strip_prefix = "benchmark-1.4.1", urls = [ "https://github.com/google/benchmark/archive/v1.4.1.tar.gz", ], ) def _maybe(repo_rule, name, **kwargs): if name not in native.existing_rules(): repo_rule(name = name, **kwargs) ================================================ FILE: bazel/tensorrt_configure.bzl ================================================ """Build rule generator for locally installed TensorRT.""" # inspired from: https://github.com/google/nvidia_libs_test def _get_env_var(repository_ctx, name, default): if name in repository_ctx.os.environ: return repository_ctx.os.environ[name] return default def _impl(repository_ctx): hdrs_path = _get_env_var(repository_ctx, "TENSORRT_HDRS_PATH", "/usr/include/x86_64-linux-gnu") libs_path = _get_env_var(repository_ctx, "TENSORRT_LIBS_PATH", "/usr/lib/x86_64-linux-gnu") print("Using TensorRT Headers from %s\n" % hdrs_path) print("Using TensorRT Libs from %s\n" % libs_path) repository_ctx.symlink(hdrs_path, "include") repository_ctx.symlink(libs_path, "libs") repository_ctx.file("BUILD", """ package(default_visibility = ["//visibility:public"]) # The *_headers cc_library rules below aren't cc_inc_library rules because # dependent targets would only see the first one. cc_library( name = "tensorrt_headers", hdrs = glob( include = ["include/Nv*.h"], ), strip_include_prefix = "include", # Allows including CUDA headers with angle brackets. # includes = ["cuda/include"], ) cc_library( name = "tensorrt_infer", srcs = ["libs/libnvinfer.so"], linkopts = ["-ldl"], ) """) tensorrt_configure = repository_rule( implementation = _impl, environ = ["TENSORRT_HDRS_PATH", "TENSORRT_LIBS_PATH"], ) ================================================ FILE: build.sh ================================================ #!/bin/bash # # Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. 
# IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

mkdir -p build
cd build
cmake -DCMAKE_BUILD_TYPE=Release ..
cmake -DCMAKE_BUILD_TYPE=Release ..
make -j
cd /work/notebooks
ln -f -s /work/build/tensorrt-laboratory/python/trtlab/trtlab.cpython-35m-x86_64-linux-gnu.so
#make install


================================================
FILE: cmake/FindTensorRT.cmake
================================================
# This module defines the following variables:
#
# ::
#
#   TensorRT_INCLUDE_DIRS
#   TensorRT_LIBRARIES
#   TensorRT_FOUND
#
# ::
#
#   TensorRT_VERSION_STRING - version (x.y.z)
#   TensorRT_VERSION_MAJOR  - major version (x)
#   TensorRT_VERSION_MINOR  - minor version (y)
#   TensorRT_VERSION_PATCH  - patch version (z)
#
# Hints
# ^^^^^
# A user may set ``TensorRT_ROOT`` to an installation root to tell this module where to look.
#

set(_TensorRT_SEARCHES)

if(TensorRT_ROOT)
  set(_TensorRT_SEARCH_ROOT PATHS ${TensorRT_ROOT} NO_DEFAULT_PATH)
  list(APPEND _TensorRT_SEARCHES _TensorRT_SEARCH_ROOT)
endif()

# appends some common paths
set(_TensorRT_SEARCH_NORMAL
  PATHS "/usr"
)
list(APPEND _TensorRT_SEARCHES _TensorRT_SEARCH_NORMAL)

# Include dir
foreach(search ${_TensorRT_SEARCHES})
  find_path(TensorRT_INCLUDE_DIR NAMES NvInfer.h ${${search}} PATH_SUFFIXES include)
endforeach()

if(NOT TensorRT_LIBRARY)
  foreach(search ${_TensorRT_SEARCHES})
    find_library(TensorRT_LIBRARY NAMES nvinfer ${${search}} PATH_SUFFIXES lib)
  endforeach()
endif()

mark_as_advanced(TensorRT_INCLUDE_DIR)

if(TensorRT_INCLUDE_DIR AND EXISTS "${TensorRT_INCLUDE_DIR}/NvInfer.h")
  file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MAJOR REGEX "^#define NV_TENSORRT_MAJOR [0-9]+.*$")
  file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_MINOR REGEX "^#define NV_TENSORRT_MINOR [0-9]+.*$")
  file(STRINGS "${TensorRT_INCLUDE_DIR}/NvInfer.h" TensorRT_PATCH REGEX "^#define NV_TENSORRT_PATCH [0-9]+.*$")

  string(REGEX REPLACE "^#define NV_TENSORRT_MAJOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MAJOR "${TensorRT_MAJOR}")
  string(REGEX REPLACE "^#define NV_TENSORRT_MINOR ([0-9]+).*$" "\\1" TensorRT_VERSION_MINOR "${TensorRT_MINOR}")
  string(REGEX REPLACE "^#define NV_TENSORRT_PATCH ([0-9]+).*$" "\\1" TensorRT_VERSION_PATCH "${TensorRT_PATCH}")
  set(TensorRT_VERSION_STRING "${TensorRT_VERSION_MAJOR}.${TensorRT_VERSION_MINOR}.${TensorRT_VERSION_PATCH}")
endif()

include(FindPackageHandleStandardArgs)
FIND_PACKAGE_HANDLE_STANDARD_ARGS(TensorRT REQUIRED_VARS TensorRT_LIBRARY TensorRT_INCLUDE_DIR VERSION_VAR TensorRT_VERSION_STRING)

if(TensorRT_FOUND)
  set(TensorRT_INCLUDE_DIRS ${TensorRT_INCLUDE_DIR})

  if(NOT TensorRT_LIBRARIES)
    set(TensorRT_LIBRARIES ${TensorRT_LIBRARY})
  endif()

  if(NOT TARGET TensorRT::TensorRT)
    add_library(TensorRT::TensorRT UNKNOWN IMPORTED)
    set_target_properties(TensorRT::TensorRT PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${TensorRT_INCLUDE_DIRS}")
    set_property(TARGET TensorRT::TensorRT APPEND PROPERTY IMPORTED_LOCATION "${TensorRT_LIBRARY}")
  endif()
endif()
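
Editor's note: a minimal consumer sketch for the module above (not a file in this repository; the `trt_app` target and `main.cc` are placeholders). It relies only on the variables and the `TensorRT::TensorRT` imported target that `FindTensorRT.cmake` defines:

```
# Make the bundled find module visible, then locate TensorRT.
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")

# Optional hint for non-standard installs:
# set(TensorRT_ROOT "/opt/tensorrt")
find_package(TensorRT REQUIRED)
message(STATUS "Found TensorRT ${TensorRT_VERSION_STRING} in ${TensorRT_INCLUDE_DIRS}")

# The imported target carries the include directories and the libnvinfer location.
add_executable(trt_app main.cc)
target_link_libraries(trt_app PRIVATE TensorRT::TensorRT)
```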
================================================
FILE: cmake/Findcpuaff.cmake
================================================
# This module defines the following variables:
#
# ::
#
#   CPUAFF_INCLUDE_DIRS
#   CPUAFF_FOUND
#
# ::
#
# Hints
# ^^^^^
# A user may set ``CPUAFF_ROOT`` to an installation root to tell this module where to look.
#

set(CPUAFF_FOUND FALSE)
set(_CPUAFF_SEARCHES)

if(CPUAFF_ROOT)
  set(_CPUAFF_SEARCH_ROOT PATHS ${CPUAFF_ROOT} NO_DEFAULT_PATH)
  list(APPEND _CPUAFF_SEARCHES _CPUAFF_SEARCH_ROOT)
else()
  list(APPEND _CPUAFF_SEARCHES "/usr")
  list(APPEND _CPUAFF_SEARCHES "/usr/local")
endif()

# Include dir
foreach(search ${_CPUAFF_SEARCHES})
  find_path(
    CPUAFF_INCLUDE_DIR
    NAMES cpuaff/cpuaff.hpp
    PATHS ${CPUAFF_ROOT}
    PATH_SUFFIXES include)
  message(STATUS "cpuaff: ${CPUAFF_INCLUDE_DIR}")
endforeach()

mark_as_advanced(CPUAFF_INCLUDE_DIR)

if(CPUAFF_INCLUDE_DIR AND EXISTS "${CPUAFF_INCLUDE_DIR}/cpuaff/cpuaff.hpp")
  set(CPUAFF_FOUND True)
  add_library(cpuaff INTERFACE)
  target_include_directories(cpuaff INTERFACE ${CPUAFF_INCLUDE_DIR})
endif()


================================================
FILE: cmake/GRPCGenerateCPP.cmake
================================================
find_package(gRPC REQUIRED COMPONENTS grpc_cpp_plugin)
set(_gRPC_CPP_PLUGIN_EXECUTABLE $<TARGET_FILE:gRPC::grpc_cpp_plugin>)

function(PROTOBUF_GENERATE_GRPC_CPP SRCS HDRS)
  cmake_parse_arguments(protobuf "" "EXPORT_MACRO;DESCRIPTORS" "" ${ARGN})
  set(PROTO_FILES "${protobuf_UNPARSED_ARGUMENTS}")
  if(NOT PROTO_FILES)
    message(SEND_ERROR "Error: PROTOBUF_GENERATE_GRPC_CPP() called without any proto files")
    return()
  endif()

  if(PROTOBUF_GENERATE_CPP_APPEND_PATH)
    # This variable is common for all types of output.
    # Create an include path for each file specified
    foreach(FIL ${PROTO_FILES})
      get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
      get_filename_component(ABS_PATH ${ABS_FIL} PATH)
      list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
      if(${_contains_already} EQUAL -1)
        list(APPEND _protobuf_include_path -I ${ABS_PATH})
      endif()
    endforeach()
  else()
    set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR})
  endif()

  if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS)
    set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}")
  endif()

  if(DEFINED Protobuf_IMPORT_DIRS)
    foreach(DIR ${Protobuf_IMPORT_DIRS})
      get_filename_component(ABS_PATH ${DIR} ABSOLUTE)
      list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
      if(${_contains_already} EQUAL -1)
        list(APPEND _protobuf_include_path -I ${ABS_PATH})
      endif()
    endforeach()
  endif()

  set(${SRCS})
  set(${HDRS})
  foreach(FIL ${PROTO_FILES})
    get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
    get_filename_component(FIL_WE ${FIL} NAME_WE)
    if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH)
      get_filename_component(FIL_DIR ${FIL} DIRECTORY)
      if(FIL_DIR)
        set(FIL_WE "${FIL_DIR}/${FIL_WE}")
      endif()
    endif()

    set(_protobuf_grpc_src "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.grpc.pb.cc")
    set(_protobuf_grpc_hdr "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.grpc.pb.h")
    list(APPEND ${SRCS} "${_protobuf_grpc_src}")
    list(APPEND ${HDRS} "${_protobuf_grpc_hdr}")

    add_custom_command(
      OUTPUT "${_protobuf_grpc_src}" "${_protobuf_grpc_hdr}"
      COMMAND ${Protobuf_PROTOC_EXECUTABLE}
              --grpc_out=${CMAKE_CURRENT_BINARY_DIR}
              --plugin=protoc-gen-grpc=${_gRPC_CPP_PLUGIN_EXECUTABLE}
              ${_protobuf_include_path} ${ABS_FIL}
      DEPENDS ${ABS_FIL} ${Protobuf_PROTOC_EXECUTABLE}
      COMMENT "Running gRPC C++ protocol buffer compiler on ${FIL}"
      VERBATIM)
  endforeach()

  set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE)
  set(${SRCS} ${${SRCS}} PARENT_SCOPE)
  set(${HDRS} ${${HDRS}} PARENT_SCOPE)
endfunction()
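
Editor's note: the function above generates only the `*.grpc.pb.cc/.h` service stubs; the message classes still come from the stock `protobuf_generate_cpp()` provided by CMake's `FindProtobuf`. A hedged usage sketch (`echo.proto` and the target name are placeholders, not files in this module):

```
find_package(Protobuf REQUIRED)
include(GRPCGenerateCPP)

# Message code from the standard FindProtobuf helper ...
protobuf_generate_cpp(PROTO_SRCS PROTO_HDRS echo.proto)
# ... and gRPC service stubs from the function defined above.
protobuf_generate_grpc_cpp(GRPC_SRCS GRPC_HDRS echo.proto)

add_library(echo_proto ${PROTO_SRCS} ${PROTO_HDRS} ${GRPC_SRCS} ${GRPC_HDRS})
target_include_directories(echo_proto PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
target_link_libraries(echo_proto PUBLIC protobuf::libprotobuf)
```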
================================================
FILE: cmake/GRPCGenerateCPPLikeBazel.cmake
================================================
find_program(GRPC_CPP_PLUGIN grpc_cpp_plugin) # Get full path to plugin

function(PROTOBUF_GENERATE_GRPC_CPP_LIKE_BAZEL SRCS HDRS)
  cmake_parse_arguments(protobuf "" "EXPORT_MACRO;DESCRIPTORS" "" ${ARGN})
  set(PROTO_FILES "${protobuf_UNPARSED_ARGUMENTS}")
  if(NOT PROTO_FILES)
    message(SEND_ERROR "Error: PROTOBUF_GENERATE_GRPC_CPP_LIKE_BAZEL() called without any proto files")
    return()
  endif()

  if(protobuf_EXPORT_MACRO)
    set(DLL_EXPORT_DECL "dllexport_decl=${protobuf_EXPORT_MACRO}:")
  endif()

  get_filename_component(ABS_PROTO_PATH ${CMAKE_SOURCE_DIR} ABSOLUTE)
  set(EXTRA_ARGS "--proto_path=${ABS_PROTO_PATH}")
  file(RELATIVE_PATH Protobuf_PRE_IMPORT_DIRS ${CMAKE_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR})

  if(PROTOBUF_GENERATE_CPP_APPEND_PATH)
    # This variable is common for all types of output.
    # Create an include path for each file specified
    foreach(FIL ${PROTO_FILES})
      get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
      get_filename_component(ABS_PATH ${ABS_FIL} PATH)
      list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
      if(${_contains_already} EQUAL -1)
        list(APPEND _protobuf_include_path -I ${ABS_PATH})
      endif()
    endforeach()
  else()
    set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR})
  endif()

  if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS)
    set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}")
  endif()

  if(DEFINED Protobuf_IMPORT_DIRS)
    foreach(DIR ${Protobuf_IMPORT_DIRS})
      get_filename_component(ABS_PATH ${DIR} ABSOLUTE)
      list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
      if(${_contains_already} EQUAL -1)
        list(APPEND _protobuf_include_path -I ${ABS_PATH})
      endif()
    endforeach()
  endif()

  set(${SRCS})
  set(${HDRS})
  foreach(FIL ${PROTO_FILES})
    message(STATUS "grpc_cpp_proto: ${FIL}")
    get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
    get_filename_component(FIL_WE ${FIL} NAME_WE)
    message(STATUS "grpc_cpp_proto_abs: ${ABS_FIL}")
    if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH)
      get_filename_component(FIL_DIR ${FIL} DIRECTORY)
      if(FIL_DIR)
        set(FIL_WE "${FIL_DIR}/${FIL_WE}")
      endif()
    endif()

    if(Protobuf_PRE_IMPORT_DIRS)
      set(_protobuf_protoc_src "${CMAKE_CURRENT_BINARY_DIR}/${Protobuf_PRE_IMPORT_DIRS}/${FIL_WE}.grpc.pb.cc")
      set(_protobuf_protoc_hdr "${CMAKE_CURRENT_BINARY_DIR}/${Protobuf_PRE_IMPORT_DIRS}/${FIL_WE}.grpc.pb.h")
    else()
      set(_protobuf_protoc_src "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.grpc.pb.cc")
      set(_protobuf_protoc_hdr "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.grpc.pb.h")
    endif()
    message(STATUS "grpc_cpp_src: ${_protobuf_protoc_src}")
    list(APPEND ${SRCS} "${_protobuf_protoc_src}")
    list(APPEND ${HDRS} "${_protobuf_protoc_hdr}")

    add_custom_command(
      OUTPUT "${_protobuf_protoc_src}" "${_protobuf_protoc_hdr}"
      COMMAND ${Protobuf_PROTOC_EXECUTABLE}
              ${EXTRA_ARGS}
              "--grpc_out=${CMAKE_CURRENT_BINARY_DIR}"
              "--plugin=protoc-gen-grpc=${GRPC_CPP_PLUGIN}"
              ${_protobuf_protoc_flags}
              ${_protobuf_include_path} ${ABS_FIL}
      DEPENDS ${ABS_FIL} ${Protobuf_PROTOC_EXECUTABLE}
      COMMENT "Running gRPC C++ protocol buffer compiler on ${FIL}"
      VERBATIM)
  endforeach()

  set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE)
  set(${SRCS} ${${SRCS}} PARENT_SCOPE)
  set(${HDRS} ${${HDRS}} PARENT_SCOPE)
endfunction()
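
Editor's note: the "like Bazel" variant pins `--proto_path` to `${CMAKE_SOURCE_DIR}`, so generated files land in a mirror of the source tree under the current binary directory and includes can be written relative to the repository root, as Bazel would lay them out. A sketch under those assumptions (`echo.proto` and the target name are placeholders):

```
include(GRPCGenerateCPPLikeBazel)

protobuf_generate_grpc_cpp_like_bazel(GRPC_SRCS GRPC_HDRS echo.proto)

add_library(echo_grpc ${GRPC_SRCS} ${GRPC_HDRS})
# Headers are emitted under a source-tree mirror, e.g.
# ${CMAKE_CURRENT_BINARY_DIR}/<dir-relative-to-repo-root>/echo.grpc.pb.h
target_include_directories(echo_grpc PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
```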
send your improvements as pull requests on Github. # Find another package and make it a dependency of the current package. # This also automatically forwards the "REQUIRED" argument. # Usage: libfind_package(<PREFIX> <another package> [extra args to find_package]) macro (libfind_package PREFIX PKG) set(${PREFIX}_args ${PKG} ${ARGN}) if (${PREFIX}_FIND_REQUIRED) set(${PREFIX}_args ${${PREFIX}_args} REQUIRED) endif() find_package(${${PREFIX}_args}) set(${PREFIX}_DEPENDENCIES ${${PREFIX}_DEPENDENCIES};${PKG}) unset(${PREFIX}_args) endmacro() # A simple wrapper to make pkg-config searches a bit easier. # Works the same as CMake's internal pkg_check_modules but is always quiet. macro (libfind_pkg_check_modules) find_package(PkgConfig QUIET) if (PKG_CONFIG_FOUND) pkg_check_modules(${ARGN} QUIET) endif() endmacro() # Avoid useless copy&pasta by doing what most simple libraries do anyway: # pkg-config, find headers, find library. # Usage: libfind_pkg_detect(<PREFIX> <pkg-config args> FIND_PATH <name> [other args] FIND_LIBRARY <name> [other args]) # E.g. libfind_pkg_detect(SDL2 sdl2 FIND_PATH SDL.h PATH_SUFFIXES SDL2 FIND_LIBRARY SDL2) function (libfind_pkg_detect PREFIX) # Parse arguments set(argname pkgargs) foreach (i ${ARGN}) if ("${i}" STREQUAL "FIND_PATH") set(argname pathargs) elseif ("${i}" STREQUAL "FIND_LIBRARY") set(argname libraryargs) else() set(${argname} ${${argname}} ${i}) endif() endforeach() if (NOT pkgargs) message(FATAL_ERROR "libfind_pkg_detect requires at least a pkg_config package name to be passed.") endif() # Find library libfind_pkg_check_modules(${PREFIX}_PKGCONF ${pkgargs}) if (pathargs) find_path(${PREFIX}_INCLUDE_DIR NAMES ${pathargs} HINTS ${${PREFIX}_PKGCONF_INCLUDE_DIRS}) endif() if (libraryargs) find_library(${PREFIX}_LIBRARY NAMES ${libraryargs} HINTS ${${PREFIX}_PKGCONF_LIBRARY_DIRS}) endif() endfunction() # Extracts a version #define from a version.h file, output stored to <PREFIX>_VERSION. # Usage: libfind_version_header(Foobar foobar/version.h FOOBAR_VERSION_STR) # Fourth argument "QUIET" may be used for silently testing different define names. # This function does nothing if the version variable is already defined. function (libfind_version_header PREFIX VERSION_H DEFINE_NAME) # Skip processing if we already have a version or if the include dir was not found if (${PREFIX}_VERSION OR NOT ${PREFIX}_INCLUDE_DIR) return() endif() set(quiet ${${PREFIX}_FIND_QUIETLY}) # Process optional arguments foreach(arg ${ARGN}) if (arg STREQUAL "QUIET") set(quiet TRUE) else() message(AUTHOR_WARNING "Unknown argument ${arg} to libfind_version_header ignored.") endif() endforeach() # Read the header and parse for version number set(filename "${${PREFIX}_INCLUDE_DIR}/${VERSION_H}") if (NOT EXISTS ${filename}) if (NOT quiet) message(AUTHOR_WARNING "Unable to find ${${PREFIX}_INCLUDE_DIR}/${VERSION_H}") endif() return() endif() file(READ "${filename}" header) string(REGEX REPLACE ".*#[ \t]*define[ \t]*${DEFINE_NAME}[ \t]*\"([^\n]*)\".*" "\\1" match "${header}") # No regex match? if (match STREQUAL header) if (NOT quiet) message(AUTHOR_WARNING "Unable to find \#define ${DEFINE_NAME} \"<version>\" from ${${PREFIX}_INCLUDE_DIR}/${VERSION_H}") endif() return() endif() # Export the version string set(${PREFIX}_VERSION "${match}" PARENT_SCOPE) endfunction() # Do the final processing once the paths have been detected. # If include dirs are needed, ${PREFIX}_PROCESS_INCLUDES should be set to contain # all the variables, each of which contain one include directory. # Ditto for ${PREFIX}_PROCESS_LIBS and library files.
# Will set ${PREFIX}_FOUND, ${PREFIX}_INCLUDE_DIRS and ${PREFIX}_LIBRARIES. # Also handles errors in case library detection was required, etc. function (libfind_process PREFIX) # Skip processing if already processed during this configuration run if (${PREFIX}_FOUND) return() endif() set(found TRUE) # Start with the assumption that the package was found # Did we find any files? Did we miss includes? These are for formatting better error messages. set(some_files FALSE) set(missing_headers FALSE) # Shorthands for some variables that we need often set(quiet ${${PREFIX}_FIND_QUIETLY}) set(required ${${PREFIX}_FIND_REQUIRED}) set(exactver ${${PREFIX}_FIND_VERSION_EXACT}) set(findver "${${PREFIX}_FIND_VERSION}") set(version "${${PREFIX}_VERSION}") # Lists of config option names (all, includes, libs) unset(configopts) set(includeopts ${${PREFIX}_PROCESS_INCLUDES}) set(libraryopts ${${PREFIX}_PROCESS_LIBS}) # Process deps to add to foreach (i ${PREFIX} ${${PREFIX}_DEPENDENCIES}) if (DEFINED ${i}_INCLUDE_OPTS OR DEFINED ${i}_LIBRARY_OPTS) # The package seems to export option lists that we can use, woohoo! list(APPEND includeopts ${${i}_INCLUDE_OPTS}) list(APPEND libraryopts ${${i}_LIBRARY_OPTS}) else() # If plural forms don't exist or they equal singular forms if ((NOT DEFINED ${i}_INCLUDE_DIRS AND NOT DEFINED ${i}_LIBRARIES) OR (${i}_INCLUDE_DIR STREQUAL ${i}_INCLUDE_DIRS AND ${i}_LIBRARY STREQUAL ${i}_LIBRARIES)) # Singular forms can be used if (DEFINED ${i}_INCLUDE_DIR) list(APPEND includeopts ${i}_INCLUDE_DIR) endif() if (DEFINED ${i}_LIBRARY) list(APPEND libraryopts ${i}_LIBRARY) endif() else() # Oh no, we don't know the option names message(FATAL_ERROR "We couldn't determine config variable names for ${i} includes and libs. Aieeh!") endif() endif() endforeach() if (includeopts) list(REMOVE_DUPLICATES includeopts) endif() if (libraryopts) list(REMOVE_DUPLICATES libraryopts) endif() string(REGEX REPLACE ".*[ ;]([^ ;]*(_INCLUDE_DIRS|_LIBRARIES))" "\\1" tmp "${includeopts} ${libraryopts}") if (NOT tmp STREQUAL "${includeopts} ${libraryopts}") message(AUTHOR_WARNING "Plural form ${tmp} found in config options of ${PREFIX}. This works as before but is now deprecated. Please only use singular forms INCLUDE_DIR and LIBRARY, and update your find scripts for LibFindMacros > 2.0 automatic dependency system (most often you can simply remove the PROCESS variables entirely).") endif() # Include/library names separated by spaces (notice: not CMake lists) unset(includes) unset(libs) # Process all includes and set found false if any are missing foreach (i ${includeopts}) list(APPEND configopts ${i}) if (NOT "${${i}}" STREQUAL "${i}-NOTFOUND") list(APPEND includes "${${i}}") else() set(found FALSE) set(missing_headers TRUE) endif() endforeach() # Process all libraries and set found false if any are missing foreach (i ${libraryopts}) list(APPEND configopts ${i}) if (NOT "${${i}}" STREQUAL "${i}-NOTFOUND") list(APPEND libs "${${i}}") else() set (found FALSE) endif() endforeach() # Version checks if (found AND findver) if (NOT version) message(WARNING "The find module for ${PREFIX} does not provide version information, so we'll just assume that it is OK.
Please fix the module or remove package version requirements to get rid of this warning.") elseif (version VERSION_LESS findver OR (exactver AND NOT version VERSION_EQUAL findver)) set(found FALSE) set(version_unsuitable TRUE) endif() endif() # If all-OK, hide all config options, export variables, print status and exit if (found) foreach (i ${configopts}) mark_as_advanced(${i}) endforeach() if (NOT quiet) message(STATUS "Found ${PREFIX} ${${PREFIX}_VERSION}") if (LIBFIND_DEBUG) message(STATUS " ${PREFIX}_DEPENDENCIES=${${PREFIX}_DEPENDENCIES}") message(STATUS " ${PREFIX}_INCLUDE_OPTS=${includeopts}") message(STATUS " ${PREFIX}_INCLUDE_DIRS=${includes}") message(STATUS " ${PREFIX}_LIBRARY_OPTS=${libraryopts}") message(STATUS " ${PREFIX}_LIBRARIES=${libs}") endif() set (${PREFIX}_INCLUDE_OPTS ${includeopts} PARENT_SCOPE) set (${PREFIX}_LIBRARY_OPTS ${libraryopts} PARENT_SCOPE) set (${PREFIX}_INCLUDE_DIRS ${includes} PARENT_SCOPE) set (${PREFIX}_LIBRARIES ${libs} PARENT_SCOPE) set (${PREFIX}_FOUND TRUE PARENT_SCOPE) endif() return() endif() # Format messages for debug info and the type of error set(vars "Relevant CMake configuration variables:\n") foreach (i ${configopts}) mark_as_advanced(CLEAR ${i}) set(val ${${i}}) if ("${val}" STREQUAL "${i}-NOTFOUND") set (val "<not found>") elseif (val AND NOT EXISTS ${val}) set (val "${val} (does not exist)") else() set(some_files TRUE) endif() set(vars "${vars} ${i}=${val}\n") endforeach() set(vars "${vars}You may use CMake GUI, cmake -D or ccmake to modify the values. Delete CMakeCache.txt to discard all values and force full re-detection if necessary.\n") if (version_unsuitable) set(msg "${PREFIX} ${${PREFIX}_VERSION} was found but") if (exactver) set(msg "${msg} only version ${findver} is acceptable.") else() set(msg "${msg} version ${findver} is the minimum requirement.") endif() else() if (missing_headers) set(msg "We could not find development headers for ${PREFIX}. Do you have the necessary dev package installed?") elseif (some_files) set(msg "We only found some files of ${PREFIX}, not all of them. Perhaps your installation is incomplete or maybe we just didn't look in the right place?") if(findver) set(msg "${msg} This could also be caused by incompatible version (if it helps, at least ${PREFIX} ${findver} should work).") endif() else() set(msg "We were unable to find package ${PREFIX}.") endif() endif() # Fatal error out if REQUIRED if (required) set(msg "REQUIRED PACKAGE NOT FOUND\n${msg} This package is REQUIRED and you need to install it or adjust CMake configuration in order to continue building ${CMAKE_PROJECT_NAME}.") message(FATAL_ERROR "${msg}\n${vars}") endif() # Otherwise just print a nasty warning if (NOT quiet) message(WARNING "WARNING: MISSING PACKAGE\n${msg} This package is NOT REQUIRED and you may ignore this warning but by doing so you may miss some functionality of ${CMAKE_PROJECT_NAME}.
\n${vars}") endif() endfunction() ================================================ FILE: cmake/ProtobufGenerateCPPLikeBazel.cmake ================================================ function(PROTOBUF_GENERATE_CPP_LIKE_BAZEL SRCS HDRS) cmake_parse_arguments(protobuf "" "EXPORT_MACRO;DESCRIPTORS" "" ${ARGN}) set(PROTO_FILES "${protobuf_UNPARSED_ARGUMENTS}") if(NOT PROTO_FILES) message(SEND_ERROR "Error: PROTOBUF_GENERATE_CPP() called without any proto files") return() endif() if(protobuf_EXPORT_MACRO) set(DLL_EXPORT_DECL "dllexport_decl=${protobuf_EXPORT_MACRO}:") endif() get_filename_component(ABS_PROTO_PATH ${CMAKE_SOURCE_DIR} ABSOLUTE) set(EXTRA_ARGS "--proto_path=${ABS_PROTO_PATH}") file(RELATIVE_PATH Protobuf_PRE_IMPORT_DIRS ${CMAKE_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}) if(PROTOBUF_GENERATE_CPP_APPEND_PATH) # Create an include path for each file specified foreach(FIL ${PROTO_FILES}) get_filename_component(ABS_FIL ${FIL} ABSOLUTE) get_filename_component(ABS_PATH ${ABS_FIL} PATH) list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) if(${_contains_already} EQUAL -1) list(APPEND _protobuf_include_path -I ${ABS_PATH}) endif() endforeach() else() set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR}) endif() if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS) set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}") endif() if(DEFINED Protobuf_IMPORT_DIRS) foreach(DIR ${Protobuf_IMPORT_DIRS}) get_filename_component(ABS_PATH ${DIR} ABSOLUTE) list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) if(${_contains_already} EQUAL -1) list(APPEND _protobuf_include_path -I ${ABS_PATH}) endif() endforeach() endif() set(${SRCS}) set(${HDRS}) if (protobuf_DESCRIPTORS) set(${protobuf_DESCRIPTORS}) endif() foreach(FIL ${PROTO_FILES}) get_filename_component(ABS_FIL ${FIL} ABSOLUTE) get_filename_component(FIL_WE ${FIL} NAME_WE) if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH) get_filename_component(FIL_DIR ${FIL} DIRECTORY) if(FIL_DIR) set(FIL_WE "${FIL_DIR}/${FIL_WE}") endif() endif() if(Protobuf_PRE_IMPORT_DIRS) set(_protobuf_protoc_src "${CMAKE_CURRENT_BINARY_DIR}/${Protobuf_PRE_IMPORT_DIRS}/${FIL_WE}.pb.cc") set(_protobuf_protoc_hdr "${CMAKE_CURRENT_BINARY_DIR}/${Protobuf_PRE_IMPORT_DIRS}/${FIL_WE}.pb.h") else() set(_protobuf_protoc_src "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc") set(_protobuf_protoc_hdr "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h") endif() list(APPEND ${SRCS} "${_protobuf_protoc_src}") list(APPEND ${HDRS} "${_protobuf_protoc_hdr}") if(protobuf_DESCRIPTORS) set(_protobuf_protoc_desc "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.desc") set(_protobuf_protoc_flags "--descriptor_set_out=${_protobuf_protoc_desc}") list(APPEND ${protobuf_DESCRIPTORS} "${_protobuf_protoc_desc}") else() set(_protobuf_protoc_desc "") set(_protobuf_protoc_flags "") endif() add_custom_command( OUTPUT "${_protobuf_protoc_src}" "${_protobuf_protoc_hdr}" ${_protobuf_protoc_desc} COMMAND protobuf::protoc ${EXTRA_ARGS} "--cpp_out=${DLL_EXPORT_DECL}${CMAKE_CURRENT_BINARY_DIR}" ${_protobuf_protoc_flags} ${_protobuf_include_path} ${ABS_FIL} DEPENDS ${ABS_FIL} protobuf::protoc COMMENT "Running C++ protocol buffer compiler on ${FIL}" VERBATIM ) endforeach() set(${SRCS} "${${SRCS}}" PARENT_SCOPE) set(${HDRS} "${${HDRS}}" PARENT_SCOPE) if(protobuf_DESCRIPTORS) set(${protobuf_DESCRIPTORS} "${${protobuf_DESCRIPTORS}}" PARENT_SCOPE) endif() endfunction() ================================================ FILE: cmake/dependencies.cmake ================================================ include 
(ExternalProject) set (DEPENDENCIES) set (EXTRA_CMAKE_ARGS) # trtlab external dependencies list (APPEND DEPENDENCIES boost dlpack gflags glog benchmark googletest cpuaff jemalloc) list (APPEND DEPENDENCIES grpc-repo protobuf c-ares grpc cub cnpy) # note on ubuntu 18.04, you need # apt install libz-dev libssl-dev # customize the folder for external projects # download, source and builds for dependencies # will be in /Dependencies set_property (DIRECTORY PROPERTY EP_BASE Dependencies) # all dependencies will be installed here # typical directories: bin, include and lib set (BUILD_ROOT ${CMAKE_CURRENT_BINARY_DIR}/Dependencies/Build) set (SOURCE_ROOT ${CMAKE_CURRENT_BINARY_DIR}/Dependencies/Source) set (INSTALL_ROOT ${CMAKE_CURRENT_BINARY_DIR}/local) # set cmake search paths to pick up installed .cmake files list(INSERT CMAKE_MODULE_PATH 0 "${INSTALL_ROOT}/lib/cmake") list(INSERT CMAKE_PREFIX_PATH 0 "${INSTALL_ROOT}/lib/cmake") # cmake config args forwarded to trtlab list(APPEND EXTRA_CMAKE_ARGS -DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH} -DCMAKE_MODULE_PATH=${CMAKE_MODULE_PATH} # -DBoost_VERBOSE=ON -DBoost_USE_STATIC_LIBS=ON -DCPUAFF_ROOT=${INSTALL_ROOT} -DJEMALLOC_STATIC_LIBRARIES=${INSTALL_ROOT}/lib/libjemalloc_pic.a -DCUB_INCLUDE_DIR=${SOURCE_ROOT}/cub -DINSTALL_ROOT=${INSTALL_ROOT} ) # short-cut to dependencies build path set (BUILD_ROOT ${CMAKE_CURRENT_BINARY_DIR}/Dependencies/Build) # Boost # ===== # - Use static linking to avoid issues with system-wide installations of Boost. # - Use numa=on to ensure the numa component of fiber gets built set(BOOST_COMPONENTS "context,fiber,filesystem") ExternalProject_Add (boost URL https://dl.bintray.com/boostorg/release/1.72.0/source/boost_1_72_0.tar.gz URL_HASH SHA256=c66e88d5786f2ca4dbebb14e06b566fb642a1a6947ad8cc9091f9f445134143f CONFIGURE_COMMAND ./bootstrap.sh --prefix=${INSTALL_ROOT} --with-libraries=${BOOST_COMPONENTS} numa=on BUILD_COMMAND ./b2 link=static cxxflags=-fPIC cflags=-fPIC cxxflags="-std=c++14" numa=on --build-dir=${BUILD_ROOT}/boost --stagedir=${BUILD_ROOT}/boost BUILD_IN_SOURCE 1 INSTALL_COMMAND ./b2 install numa=on ) # DLPack # ====== ExternalProject_Add(dlpack GIT_REPOSITORY "https://github.com/dmlc/dlpack.git" GIT_TAG "master" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${INSTALL_ROOT} ) # gflags # ====== # config, build and install to INSTALL_ROOT ExternalProject_Add(gflags GIT_REPOSITORY "https://github.com/gflags/gflags.git" GIT_TAG "v2.2.2" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${INSTALL_ROOT} -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON -DBUILD_PACKAGING=OFF -DBUILD_TESTING=OFF -DBUILD_CONFIG_TESTS=OFF -DINSTALL_HEADERS=ON -DBUILD_gflags_LIB=OFF -DBUILD_gflags_nothreads_LIB=ON -DGFLAGS_NAMESPACE=google ) # glog # ==== # - link against shared # - todo: compile with -DWITH_GFLAGS=OFF and remove gflags dependency ExternalProject_Add(glog DEPENDS gflags GIT_REPOSITORY "https://github.com/google/glog" GIT_TAG "v0.4.0" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${INSTALL_ROOT} -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF ) # google benchmark # ================ ExternalProject_Add(benchmark DEPENDS GIT_REPOSITORY https://github.com/google/benchmark.git GIT_TAG "v1.5.0" BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/Dependencies/Build/benchmark" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${INSTALL_ROOT} -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_TESTING=OFF ) # google test # =========== ExternalProject_Add(googletest DEPENDS glog gflags GIT_REPOSITORY https://github.com/google/googletest.git GIT_TAG "release-1.10.0" BINARY_DIR 
"${CMAKE_CURRENT_BINARY_DIR}/Dependencies/Build/googletest" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${INSTALL_ROOT} -DCMAKE_BUILD_TYPE=Release ) # cpuaff # ====== ExternalProject_Add(cpuaff URL http://dcdillon.github.io/cpuaff/releases/cpuaff-1.0.6.tar.gz CONFIGURE_COMMAND ./configure --prefix=${INSTALL_ROOT} BUILD_COMMAND make include INSTALL_COMMAND make install include BUILD_IN_SOURCE 1 ) # nvidia cub # ========== ExternalProject_Add(cub GIT_REPOSITORY https://github.com/NVlabs/cub.git GIT_TAG "1.8.0" CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" TEST_COMMAND "" ) # jemalloc # ======== ExternalProject_Add(jemalloc URL https://github.com/jemalloc/jemalloc/releases/download/5.2.1/jemalloc-5.2.1.tar.bz2 CONFIGURE_COMMAND ./configure --prefix=${INSTALL_ROOT} BUILD_COMMAND make include INSTALL_COMMAND make install include BUILD_IN_SOURCE 1 ) # cnpy - c++ library for reading and writing .npy/.npz files # ========================================================== ExternalProject_Add(cnpy GIT_REPOSITORY "https://github.com/rogersce/cnpy.git" GIT_TAG "master" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${INSTALL_ROOT} -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=OFF -DCMAKE_POSITION_INDEPENDENT_CODEL=ON ) # grpc-repo # ========= ExternalProject_Add(grpc-repo GIT_REPOSITORY "https://github.com/grpc/grpc.git" GIT_TAG "v1.32.0" GIT_SUBMODULES "third_party/cares/cares" "third_party/protobuf" "third_party/abseil-cpp" "third_party/re2" CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" TEST_COMMAND "" ) # # Build protobuf project from grpc-repo # ExternalProject_Add(absl SOURCE_DIR "${SOURCE_ROOT}/grpc-repo/third_party/abseil-cpp" DOWNLOAD_COMMAND "" CMAKE_CACHE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=TRUE -DCMAKE_INSTALL_PREFIX:PATH=${INSTALL_ROOT} DEPENDS grpc-repo ) ExternalProject_Add(re2 SOURCE_DIR "${SOURCE_ROOT}/grpc-repo/third_party/re2" DOWNLOAD_COMMAND "" CMAKE_CACHE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=TRUE -DCMAKE_INSTALL_PREFIX:PATH=${INSTALL_ROOT} DEPENDS grpc-repo ) ExternalProject_Add(protobuf SOURCE_DIR "${SOURCE_ROOT}/grpc-repo/third_party/protobuf/cmake" DOWNLOAD_COMMAND "" CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -Dprotobuf_BUILD_TESTS:BOOL=OFF -Dprotobuf_WITH_ZLIB:BOOL=OFF -Dprotobuf_MSVC_STATIC_RUNTIME:BOOL=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX:PATH=${INSTALL_ROOT} DEPENDS grpc-repo ) # Location where protobuf-config.cmake will be installed varies by # platform if (WIN32) set(_FINDPACKAGE_PROTOBUF_CONFIG_DIR "${CMAKE_CURRENT_BINARY_DIR}/protobuf/cmake") else() set(_FINDPACKAGE_PROTOBUF_CONFIG_DIR "${INSTALL_ROOT}/lib/cmake") endif() # # Build c-area project from grpc-repo # ExternalProject_Add(c-ares SOURCE_DIR "${SOURCE_ROOT}/grpc-repo/third_party/cares/cares" DOWNLOAD_COMMAND "" CMAKE_ARGS -DCARES_SHARED:BOOL=OFF -DCARES_STATIC:BOOL=ON -DCARES_STATIC_PIC:BOOL=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX:PATH=${INSTALL_ROOT} DEPENDS grpc-repo ) # # Build GRPC # ExternalProject_Add(grpc SOURCE_DIR "${SOURCE_ROOT}/grpc-repo" DOWNLOAD_COMMAND "" CMAKE_ARGS -DgRPC_INSTALL:BOOL=ON -DgRPC_BUILD_TESTS:BOOL=OFF -DgRPC_PROTOBUF_PROVIDER:STRING=package -DgRPC_PROTOBUF_PACKAGE_TYPE:STRING=CONFIG -DProtobuf_DIR:PATH=${INSTALL_ROOT}/lib/cmake -DgRPC_ZLIB_PROVIDER:STRING=package -DgRPC_CARES_PROVIDER:STRING=package -Dc-ares_DIR:PATH=${INSTALL_ROOT}/lib/cmake -DgRPC_SSL_PROVIDER:STRING=package -DgRPC_GFLAGS_PROVIDER=package -DgRPC_BENCHMARK_PROVIDER=package -DgRPC_RE2_PROVIDER:STRING=package -Dre2_DIR:STRING=${INSTALL_ROOT}/lib/cmake 
-DgRPC_ABSL_PROVIDER:STRING=package -Dabsl_DIR:STRING=${INSTALL_ROOT}/lib/cmake ${_CMAKE_ARGS_OPENSSL_ROOT_DIR} -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX:PATH=${INSTALL_ROOT} DEPENDS grpc-repo c-ares protobuf re2 absl gflags benchmark ) # trtlab # ====== ExternalProject_Add (trtlab DEPENDS ${DEPENDENCIES} SOURCE_DIR ${PROJECT_SOURCE_DIR} CMAKE_ARGS -DBUILD_DEPENDENCIES=OFF ${EXTRA_CMAKE_ARGS} INSTALL_COMMAND "" BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) ================================================ FILE: devel.sh ================================================ #!/bin/bash # # Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # models_path=${TRT_MODELS_PATH:-"/path/to/my/models"} models_cli="" if [ -d "$models_path" ]; then models_cli=" -v $(realpath $models_path):/work/models " fi crt="" if [ -x "$(which luda)" ] ; then echo "Using luda" crt="$(which luda) --no-home" elif [ -x "$(which nvidia-docker)" ]; then echo "Using nvidia-docker" crt="nvidia-docker run --rm -ti" else echo "No GPU container runtime found" exit 911 fi NV_GPU=0 $crt -v $PWD:/work $models_cli --workdir /work --name trtlab --net host trtlab ================================================ FILE: examples/00_TensorRT/CMakeLists.txt ================================================ # Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. 
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. include_directories(${CUDA_INCLUDE_DIRS}) include_directories(${TensorRT_INCLUDE_DIRS}) add_executable(inference.x inference.cc ${PROTO_SRCS} ${PROTO_GRPC_SRCS}) target_link_libraries(inference.x trtlab::nvrpc trtlab::tensorrt gflags ) add_executable(infer.x infer.cc ${PROTO_SRCS} ${PROTO_GRPC_SRCS}) target_link_libraries(infer.x trtlab::nvrpc trtlab::tensorrt gflags ) if(YAIS_ENABLE_MPI) find_package(MPI) include_directories(SYSTEM ${MPI_INCLUDE_PATH}) target_link_libraries(inference.x ${MPI_C_LIBRARIES} ${MPI_CXX_LIBRARIES} ) target_compile_definitions(inference.x PUBLIC PLAYGROUND_USE_MPI) endif() ================================================ FILE: examples/00_TensorRT/README.md ================================================ # Inference Example Basic CLI tool for executing TensorRT engines. Provide an engine and `inference.x` will run a simplified inference pipeline using synthetic data. The program will run a pipelined H2D -> TensorRT -> D2H calculation for `--seconds` (default: 5) with a 0.1 second warmup run. By default, only 1 TensorRT Execution Context is used to perform the evaluation. You can modify the number of contexts using the `--contexts` option. Unless provided, the number of Input/Output Buffers is set to `(2 * contexts)`. See below for the list of [options](#options). The `inference.x` program is fully pipelined and asynchronous. It uses three threads (default) to: 1) async copy the input H2D, 2) launch the async inference evaluation and return the output tensors to the host, and 3) wait on the resources used during execution and release them when finished. This final thread is where one might build a return message or do something else with the results. While running `inference.x`, you may find it useful to monitor GPU metrics using: ``` nvidia-smi dmon -i 0 -s put ``` Note: If you see numbers that differ from the output of `giexec`, you may have an IO bottleneck in that the transfers are more expensive than the compute. * TODO: Update the program to output avg xfer time.
* TODO: Build .engine files as part of the build ## Quickstart ``` root@dgx:/work/build/examples/00_TensorRT# ./inference.x --engine=/work/models/ResNet-50-b1-int8.engine I0702 22:16:51.868419 10857 TensorRT.cc:561] -- Initialzing TensorRT Resource Manager -- I0702 22:16:51.868676 10857 TensorRT.cc:562] Maximum Execution Concurrency: 1 I0702 22:16:51.868686 10857 TensorRT.cc:563] Maximum Copy Concurrency: 2 I0702 22:16:53.430330 10857 TensorRT.cc:628] -- Registering Model: 0 -- I0702 22:16:53.430399 10857 TensorRT.cc:629] Input/Output Tensors require 591.9 KiB I0702 22:16:53.430415 10857 TensorRT.cc:630] Execution Activations require 2.5 MiB I0702 22:16:53.430428 10857 TensorRT.cc:633] Weights require 30.7 MiB I0702 22:16:53.437571 10857 TensorRT.cc:652] -- Allocating TensorRT Resources -- I0702 22:16:53.437587 10857 TensorRT.cc:653] Creating 1 TensorRT execution tokens. I0702 22:16:53.437595 10857 TensorRT.cc:654] Creating a Pool of 2 Host/Device Memory Stacks I0702 22:16:53.437607 10857 TensorRT.cc:655] Each Host Stack contains 608.0 KiB I0702 22:16:53.437614 10857 TensorRT.cc:656] Each Device Stack contains 3.2 MiB I0702 22:16:53.437623 10857 TensorRT.cc:657] Total GPU Memory: 6.5 MiB I0702 22:16:53.540400 10857 inference.cc:93] -- Inference: Running for ~5 seconds with batch_size 1 -- I0702 22:16:58.543475 10857 inference.cc:131] Inference Results: 4770 batches in 5.00307 seconds; sec/batch: 0.00104886; inf/sec: 953.414 ``` ## Options ``` -buffers (Number of Buffers (default: 2x contexts)) type: int32 default: 0 -contexts (Number of Execution Contexts) type: int32 default: 1 -cudathreads (Number Cuda Launcher Threads) type: int32 default: 1 -engine (TensorRT serialized engine) type: string default: "/work/models/trt4.engine" -respthreads (Number Response Sync Threads) type: int32 default: 1 -seconds (Number of Execution Contexts) type: int32 default: 5 ``` ================================================ FILE: examples/00_TensorRT/infer.cc ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include #include #include #include #include "tensorrt/laboratory/core/thread_pool.h" #include "tensorrt/laboratory/infer_bench.h" #include "tensorrt/laboratory/inference_manager.h" #include "tensorrt/laboratory/model.h" #include "tensorrt/laboratory/runtime.h" #ifdef PLAYGROUND_USE_MPI #include "mpi.h" #define MPI_CHECK(mpicall) mpicall #else #define MPI_CHECK(mpicall) #endif using trtlab::ThreadPool; using trtlab::TensorRT::InferBench; using trtlab::TensorRT::InferBenchKey; using trtlab::TensorRT::InferenceManager; using trtlab::TensorRT::ManagedRuntime; using trtlab::TensorRT::Model; using trtlab::TensorRT::Runtime; using trtlab::TensorRT::StandardRuntime; static std::string ModelName(int model_id) { std::ostringstream stream; stream << model_id; return stream.str(); } static bool ValidateEngine(const char* flagname, const std::string& value) { struct stat buffer; return (stat(value.c_str(), &buffer) == 0); } DEFINE_string(engine, "/path/to/tensorrt.engine", "TensorRT serialized engine"); DEFINE_validator(engine, &ValidateEngine); DEFINE_string(runtime, "default", "TensorRT Runtime"); DEFINE_int32(seconds, 5, "Approximate number of seconds for the timing loop"); DEFINE_int32(contexts, 1, "Number of Execution Contexts"); DEFINE_int32(buffers, 0, "Number of Buffers (default: 2x contexts)"); DEFINE_int32(cudathreads, 1, "Number Cuda Launcher Threads"); DEFINE_int32(respthreads, 1, "Number Response Sync Threads"); DEFINE_int32(replicas, 1, "Number of Replicas of the Model to load"); DEFINE_int32(batch_size, 0, "Overrides the max batch_size of the provided engine"); int main(int argc, char* argv[]) { FLAGS_alsologtostderr = 1; // Log to console ::google::InitGoogleLogging("TensorRT Inference"); ::google::ParseCommandLineFlags(&argc, &argv, true); MPI_CHECK(MPI_Init(&argc, &argv)); auto contexts = FLAGS_contexts; auto buffers = FLAGS_buffers ? FLAGS_buffers : 2 * FLAGS_contexts; auto resources = std::make_shared(contexts, buffers); resources->RegisterThreadPool("pre", std::make_unique(1)); resources->RegisterThreadPool("cuda", std::make_unique(1)); resources->RegisterThreadPool("post", std::make_unique(3)); //, FLAGS_cudathreads, FLAGS_respthreads); std::shared_ptr runtime; if(FLAGS_runtime == "default") { runtime = std::make_shared(); } else if(FLAGS_runtime == "unified") { runtime = std::make_shared(); } else { LOG(FATAL) << "Invalid TensorRT Runtime"; } std::vector> models; models.push_back(runtime->DeserializeEngine(FLAGS_engine)); resources->RegisterModel("0", models.back()); resources->AllocateResources(); auto batch_size = FLAGS_batch_size ? 
FLAGS_batch_size : models.back()->GetMaxBatchSize(); for(int i = 1; i < FLAGS_replicas; i++) { models.push_back(runtime->DeserializeEngine(FLAGS_engine)); resources->RegisterModel(ModelName(i), models.back()); } { InferBench benchmark(resources); benchmark.Run(models, batch_size, 0.1); // if testing mps - sync all processes before executing timed loop MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); auto results = benchmark.Run(models, batch_size, FLAGS_seconds); MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); // todo: perform an mpi_allreduce to collect the per process timings // for a simplified report MPI_CHECK(MPI_Finalize()); using namespace trtlab::TensorRT; LOG(INFO) << "Inference Results: " << (*results)[kBatchesComputed] << " batches computed in " << (*results)[kWalltime] << " seconds on " << (*results)[kMaxExecConcurrency] << " compute streams using batch_size: " << (*results)[kBatchSize] << "; inf/sec: " << (*results)[kInferencesPerSecond] << "; batches/sec: " << (*results)[kBatchesPerSecond] << "; execution time per batch: " << (*results)[kExecutionTimePerBatch]; } return 0; } ================================================ FILE: examples/00_TensorRT/inference.cc ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include #include #include #include #include "tensorrt/laboratory/core/thread_pool.h" #include "tensorrt/laboratory/inference_manager.h" #include "tensorrt/laboratory/runtime.h" #ifdef PLAYGROUND_USE_MPI #include "mpi.h" #define MPI_CHECK(mpicall) mpicall #else #define MPI_CHECK(mpicall) #endif using trtlab::ThreadPool; using trtlab::TensorRT::CustomRuntime; using trtlab::TensorRT::InferenceManager; using trtlab::TensorRT::ManagedAllocator; using trtlab::TensorRT::Runtime; using trtlab::TensorRT::StandardAllocator; static int g_Concurrency = 0; static std::string ModelName(int model_id) { std::ostringstream stream; stream << model_id; return stream.str(); } class InferenceResources : public InferenceManager { public: InferenceResources(int max_executions, int max_buffers, size_t nCuda, size_t nResp) : InferenceManager(max_executions, max_buffers), m_CudaThreadPool(std::make_unique(nCuda)), m_ResponseThreadPool(std::make_unique(nResp)) { } ~InferenceResources() override {} std::unique_ptr& GetCudaThreadPool() { return m_CudaThreadPool; } std::unique_ptr& GetResponseThreadPool() { return m_ResponseThreadPool; } private: std::unique_ptr m_CudaThreadPool; std::unique_ptr m_ResponseThreadPool; }; class Inference final { public: Inference(std::shared_ptr resources) : m_Resources(resources) {} void Run(float seconds, bool warmup, int replicas, uint32_t requested_batch_size) { int replica = 0; uint64_t inf_count = 0; auto start = std::chrono::steady_clock::now(); auto elapsed = [start]() -> float { return std::chrono::duration(std::chrono::steady_clock::now() - start).count(); }; auto model = GetResources()->GetModel(ModelName(replica++)); auto batch_size = requested_batch_size ? requested_batch_size : model->GetMaxBatchSize(); if(batch_size > model->GetMaxBatchSize()) { LOG(FATAL) << "Requested batch_size greater than allowed by the compiled TensorRT Engine"; } // Inference Loop - Main thread copies, cuda thread launches, response thread completes if(!warmup) { LOG(INFO) << "-- Inference: Running for ~" << (int)seconds << " seconds with batch_size " << batch_size << " --"; } std::vector> futures; while(elapsed() < seconds && ++inf_count) { if(replica >= replicas) replica = 0; // This thread only async copies buffers H2D auto model = GetResources()->GetModel(ModelName(replica++)); auto buffers = GetResources()->GetBuffers(); // <=== Limited Resource; May Block !!! auto bindings = buffers->CreateBindings(model); auto promise = std::make_shared>(); futures.push_back(promise->get_future()); bindings->SetBatchSize(batch_size); bindings->CopyToDevice(bindings->InputBindings()); GetResources()->GetCudaThreadPool()->enqueue([this, bindings, promise]() mutable { // This thread enqueues two async kernels: // 1) TensorRT execution // 2) D2H of output tensors auto trt = GetResources()->GetExecutionContext( bindings->GetModel()); // <=== Limited Resource; May Block !!! 
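// Note: Infer() and CopyFromDevice() below only enqueue asynchronous work on the // execution context's stream; this lambda returns long before the GPU finishes. // Completion is observed on the response thread via trt->Synchronize() and // bindings->Synchronize(), which then release both limited resources.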
trt->Infer(bindings); bindings->CopyFromDevice(bindings->OutputBindings()); GetResources()->GetResponseThreadPool()->enqueue( [bindings, trt, promise]() mutable { // This thread waits on the completion of the async compute and the async // copy trt->Synchronize(); trt.reset(); // Finished with the Execution Context - Release it to // competing threads bindings->Synchronize(); bindings.reset(); // Finished with Buffers - Release it to competing threads promise->set_value(); }); }); } for(const auto& f : futures) { f.wait(); } /* // Join worker threads if (!warmup) GetResources()->GetCudaThreadPool().reset(); if (!warmup) GetResources()->GetResponseThreadPool().reset(); */ // End timing and report auto total_time = std::chrono::duration(elapsed()).count(); auto inferences = inf_count * batch_size; if(!warmup) LOG(INFO) << "Inference Results: " << inf_count << "; batches in " << total_time << " seconds" << "; sec/batch/stream: " << total_time / (inf_count / g_Concurrency) << "; batches/sec: " << inf_count / total_time << "; inf/sec: " << inferences / total_time; } protected: inline std::shared_ptr GetResources() { return m_Resources; } private: std::shared_ptr m_Resources; }; static bool ValidateEngine(const char* flagname, const std::string& value) { struct stat buffer; return (stat(value.c_str(), &buffer) == 0); } DEFINE_string(engine, "/path/to/tensorrt.engine", "TensorRT serialized engine"); DEFINE_validator(engine, &ValidateEngine); DEFINE_string(runtime, "default", "TensorRT Runtime"); DEFINE_int32(seconds, 5, "Approximate number of seconds for the timing loop"); DEFINE_int32(contexts, 1, "Number of Execution Contexts"); DEFINE_int32(buffers, 0, "Number of Buffers (default: 2x contexts)"); DEFINE_int32(cudathreads, 1, "Number Cuda Launcher Threads"); DEFINE_int32(respthreads, 1, "Number Response Sync Threads"); DEFINE_int32(replicas, 1, "Number of Replicas of the Model to load"); DEFINE_int32(batch_size, 0, "Overrides the max batch_size of the provided engine"); int main(int argc, char* argv[]) { FLAGS_alsologtostderr = 1; // Log to console ::google::InitGoogleLogging("TensorRT Inference"); ::google::ParseCommandLineFlags(&argc, &argv, true); MPI_CHECK(MPI_Init(&argc, &argv)); auto contexts = g_Concurrency = FLAGS_contexts; auto buffers = FLAGS_buffers ? FLAGS_buffers : 2 * FLAGS_contexts; auto resources = std::make_shared(contexts, buffers, FLAGS_cudathreads, FLAGS_respthreads); std::shared_ptr runtime; if(FLAGS_runtime == "default") { runtime = std::make_shared>(); } else if(FLAGS_runtime == "unified") { runtime = std::make_shared>(); } else { LOG(FATAL) << "Invalid TensorRT Runtime"; } resources->RegisterModel("0", runtime->DeserializeEngine(FLAGS_engine)); resources->AllocateResources(); for(int i = 1; i < FLAGS_replicas; i++) { resources->RegisterModel(ModelName(i), runtime->DeserializeEngine(FLAGS_engine)); } Inference inference(resources); inference.Run(0.1, true, 1, 0); // warmup // if testing mps - sync all processes before executing timed loop MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); inference.Run(FLAGS_seconds, false, FLAGS_replicas, FLAGS_batch_size); MPI_CHECK(MPI_Barrier(MPI_COMM_WORLD)); // todo: perform an mpi_allreduce to collect the per process timings // for a simplified report MPI_CHECK(MPI_Finalize()); return 0; } ================================================ FILE: examples/01_Basic_GRPC/CMakeLists.txt ================================================ # Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add_executable(echo-grpc.x src/server.cpp) target_link_libraries(echo-grpc.x nvrpc echo-protos gflags ) add_executable(echo-client.x src/client.cpp) target_link_libraries(echo-client.x nvrpc echo-protos gflags ) add_executable(async-echo-client.x src/async_client.cc) target_link_libraries(async-echo-client.x nvrpc nvrpc-client echo-protos gflags ) ================================================ FILE: examples/01_Basic_GRPC/README.md ================================================ Simple service to test and stress the core service and request logic. The [`server.cpp`](examples/01_Basic_GRPC/src/server.cpp) is very well documented and should be used as a reference for the gRPC interface provided by the library. ================================================ FILE: examples/01_Basic_GRPC/src/async_client.cc ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include "nvrpc/client/client_unary.h" #include "nvrpc/client/executor.h" #include "echo.grpc.pb.h" using grpc::Channel; using grpc::ClientContext; using grpc::Status; using simple::Inference; using simple::Input; using simple::Output; using nvrpc::client::ClientUnary; using nvrpc::client::Executor; DEFINE_int32(count, 100, "number of grpc messages to send"); DEFINE_int32(thread_count, 1, "Size of thread pool"); int main(int argc, char** argv) { // Instantiate the client. It requires a channel, out of which the actual RPCs // are created. This channel models a connection to an endpoint (in this case, // localhost at port 50051). We indicate that the channel isn't authenticated // (use of InsecureChannelCredentials()). FLAGS_alsologtostderr = 1; // It will dump to console ::google::ParseCommandLineFlags(&argc, &argv, true); auto executor = std::make_shared(FLAGS_thread_count); auto channel = grpc::CreateChannel("localhost:50051", grpc::InsecureChannelCredentials()); auto stub = Inference::NewStub(channel); auto infer_prepare_fn = [&stub](::grpc::ClientContext * context, const ::simple::Input& request, ::grpc::CompletionQueue* cq) -> auto { return std::move(stub->PrepareAsyncCompute(context, request, cq)); }; auto runner = std::make_unique>(infer_prepare_fn, executor); auto start = std::chrono::steady_clock::now(); auto elapsed = [start]() -> float { return std::chrono::duration(std::chrono::steady_clock::now() - start).count(); }; for(int i = 0; i < FLAGS_count; i++) { Input input; input.set_batch_id(i); runner->Enqueue(std::move(input), [i](Input& input, Output& output, ::grpc::Status& status) -> bool { CHECK(output.batch_id() == i); LOG_FIRST_N(INFO, 20) << "Check: " << i; return (bool)(output.batch_id() == i); }); } std::cout << FLAGS_count << " queued in " << elapsed() << "seconds" << std::endl; executor->ShutdownAndJoin(); std::cout << FLAGS_count << " completed in " << elapsed() << "seconds" << std::endl; return 0; } ================================================ FILE: examples/01_Basic_GRPC/src/client.cpp ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include "echo.grpc.pb.h" using grpc::Channel; using grpc::ClientContext; using grpc::Status; using simple::Input; using simple::Output; using simple::Inference; class SimpleClient { public: SimpleClient(std::shared_ptr channel) : stub_(Inference::NewStub(channel)) {} // Assembles the client's payload, sends it and presents the response back // from the server. int Compute(const int batch_id) { // Data we are sending to the server. Input request; request.set_batch_id(batch_id); // Container for the data we expect from the server. Output reply; // Context for the client. It could be used to convey extra information to // the server and/or tweak certain RPC behaviors. ClientContext context; // The actual RPC. Status status = stub_->Compute(&context, request, &reply); // Act upon its status. if (status.ok()) { return reply.batch_id(); } else { std::cout << status.error_code() << ": " << status.error_message() << std::endl; return -1; } } private: std::unique_ptr stub_; }; DEFINE_int32(count, 100, "number of grpc messages to send"); int main(int argc, char** argv) { // Instantiate the client. It requires a channel, out of which the actual RPCs // are created. This channel models a connection to an endpoint (in this case, // localhost at port 50051). We indicate that the channel isn't authenticated // (use of InsecureChannelCredentials()). FLAGS_alsologtostderr = 1; // It will dump to console ::google::ParseCommandLineFlags(&argc, &argv, true); SimpleClient client(grpc::CreateChannel( "localhost:50051", grpc::InsecureChannelCredentials())); auto start = std::chrono::steady_clock::now(); for(int i=0; i(end - start).count(); std::cout << FLAGS_count << " requests in " << elapsed << "seconds" << std::endl; return 0; } ================================================ FILE: examples/01_Basic_GRPC/src/server.cpp ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include "nvrpc/server.h" #include "nvrpc/service.h" #include "nvrpc/executor.h" #include "tensorrt/laboratory/core/pool.h" #include "tensorrt/laboratory/core/resources.h" #include "tensorrt/laboratory/core/thread_pool.h" #include "echo.pb.h" #include "echo.grpc.pb.h" using nvrpc::AsyncService; using nvrpc::AsyncRPC; using nvrpc::Context; using nvrpc::Executor; using nvrpc::Server; using trtlab::Resources; using trtlab::ThreadPool; // CLI Options DEFINE_int32(thread_count, 1, "Size of thread pool"); /** * Embedding a copy of the Protobuf specification for the gRPC service. * * Package Name: simple * Service Name: Inference * RPC Name: Compute * * Incoming Message: Input * Outgoing Message: Output ** syntax = "proto3"; package simple; service Inference { rpc Compute (Input) returns (Output) {} } message Input { uint64 batch_id = 1; } message Output { uint64 batch_id = 1; } */ // Define the resources your RPC will need to execute // ================================================== // In this case, all simple::Inference::Compute RPCs share a threadpool in which they will // queue up some work. This essentially means, after the message has been received and // processed, the actual work for the RPC is pushed to a worker pool outside the scope of // the transaction processing system (TPS). This is essentially async computing: we have // decoupled the transaction from the workers executing the implementation. The TPS can // continue to queue work, while the workers process the load. struct SimpleResources : public Resources { SimpleResources(int numThreadsInPool=3) : m_ThreadPool(numThreadsInPool) { LOG(INFO) << "Server ThreadCount: " << numThreadsInPool; } ThreadPool& AcquireThreadPool() { return m_ThreadPool; } private: ThreadPool m_ThreadPool; }; // Contexts hold the state and provide the definition of the work to be performed by the RPC. // This is where you define what gets executed for a given RPC. // Incoming Message = simple::Input (RequestType) // Outgoing Message = simple::Output (ResponseType) class SimpleContext final : public Context<simple::Input, simple::Output, SimpleResources> { void ExecuteRPC(RequestType &input, ResponseType &output) final override { // We could do work here, but we'd block the TPS, i.e. the threads pulling messages // off the incoming receive queue. Very quick responses are best done here; however, // longer-running workloads should be offloaded so the TPS can avoid being blocked. GetResources()->AcquireThreadPool().enqueue([this, &input, &output]{ // Now running on a worker thread of the ThreadPool defined in SimpleResources. // Here we are just echoing back the incoming batch_id; however, in later // examples, we'll show how to run an async cuda pipeline.
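// Note: input and output are captured by reference; both messages live in the // Context and remain valid until FinishResponse() completes the transaction, // so touching them from this worker thread is safe.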
LOG_FIRST_N(INFO, 20) << "Tag = " << Tag() << " Thread = " << std::this_thread::get_id(); output.set_batch_id(input.batch_id()); this->FinishResponse(); }); // The TPS thread is now free to continue processing messages - async ftw! } }; int main(int argc, char *argv[]) { FLAGS_alsologtostderr = 1; // Log to console ::google::InitGoogleLogging("simpleServer"); ::google::ParseCommandLineFlags(&argc, &argv, true); // A server will bind an IP:PORT to listen on Server server("0.0.0.0:50051"); // A server can host multiple services LOG(INFO) << "Register Service (simple::Inference) with Server"; auto simpleInference = server.RegisterAsyncService<simple::Inference>(); // An RPC has two components that need to be specified when registering with the service: // 1) Type of Execution Context (SimpleContext). The execution context defines the behavior // of the RPC, i.e. it contains the control logic for the execution of the RPC. // 2) The Request function (RequestCompute) which was generated by gRPC when compiling the // protobuf which defined the service. This function is responsible for queuing the // RPC's execution context on the server's completion queue. LOG(INFO) << "Register RPC (simple::Inference::Compute) with Service (simple::Inference)"; auto rpcCompute = simpleInference->RegisterRPC<SimpleContext>( &simple::Inference::AsyncService::RequestCompute ); LOG(INFO) << "Initializing Resources for RPC (simple::Inference::Compute)"; auto rpcResources = std::make_shared<SimpleResources>(FLAGS_thread_count); // Create Executors - Executors provide the messaging processing resources for the RPCs // Multiple Executors can be registered with a Server. The executor is responsible // for pulling incoming messages off the receive queue and executing the associated // context. By default, an executor only uses a single thread. A typical use case is // an Executor that executes a context which immediately pushes the work to a thread pool. // However, for very low-latency messaging, you might want to use a multi-threaded // Executor and a Blocking Context - meaning the Context performs the entire RPC function // on the Executor's thread. LOG(INFO) << "Creating Executor"; auto executor = server.RegisterExecutor(new Executor(1)); // You can register RPC execution contexts from any registered RPC on any executor. // The power of that will become clear in later examples. For now, we will register // 10 instances of the simple::Inference::Compute RPC's SimpleContext execution context // with the Executor. LOG(INFO) << "Creating Execution Contexts for RPC (simple::Inference::Compute) with Executor"; executor->RegisterContexts(rpcCompute, rpcResources, 10); LOG(INFO) << "Running Server"; server.Run(std::chrono::milliseconds(2000), []{ // This is a timeout loop executed every 2 seconds // Run() with no arguments will run an empty timeout loop every 5 seconds. // RunAsync() will return immediately; it's your responsibility to ensure the // server doesn't go out of scope or a Shutdown will be triggered on your services. }); } ================================================ FILE: examples/02_TensorRT_GRPC/CMakeLists.txt ================================================ # Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

find_package(prometheus-cpp CONFIG REQUIRED)
if(prometheus-cpp_FOUND)
  message(STATUS "Prometheus Metrics Enabled")
endif(prometheus-cpp_FOUND)

add_executable(inference-grpc.x
  src/metrics.cc
  src/server.cc
)

target_include_directories(inference-grpc.x PUBLIC $ )

target_link_libraries(inference-grpc.x
  trtlab::nvrpc
  trtlab::tensorrt
  prometheus-cpp::prometheus-cpp
  demo-protos
  gflags
)

add_executable(client-sync.x src/sync-client.cc)
target_link_libraries(client-sync.x nvrpc demo-protos gflags)

add_executable(client-async.x src/async-client.cc)
target_link_libraries(client-async.x nvrpc demo-protos gflags)

add_executable(siege.x src/siege.cc)
target_link_libraries(siege.x nvrpc demo-protos gflags)

================================================
FILE: examples/02_TensorRT_GRPC/README.md
================================================
# TensorRT GRPC Example

This example extends the [TensorRT](examples/00_TensorRT) compute loop into an async gRPC
service similar to [example 01_gRPC](examples/01_GRPC). There are three takeaways from this
example:

1. The TensorRT compute pipeline is implemented as the `ExecuteRPC` virtual function of the `Context`.
2. An external datasource is used to override the input bindings.
3. Custom [Prometheus](https://prometheus.io) metrics are recorded for inference compute and
   request durations, load ratio, and a GPU power gauge.

## Quickstart

```
cd /work/build/examples/02_TensorRT_GRPC
./inference-grpc.x --contexts=8 --engine=/work/models/ResNet-50-b1-int8.engine --port 50051 &
./siege.x --port=50051 --rate=2500 # ctrl+c to cancel client
telegraf -test -config /work/examples/91_Prometheus/scrape.conf
```

## Explore

Fun things to try:

* Evaluate the performance of the model using `inference.x` in [examples/00_TensorRT](examples/00_TensorRT)
* Try running `siege.x` below, at, and above the benchmarked rate and watch the metrics via `telegraf`.
* Deploy on Kubernetes, collect metrics via Prometheus and visualize using Grafana; [examples/90_Kubernetes](examples/90_Kubernetes).

## Server/Service

`inference-grpc.x` CLI options:

* `--engine` - the compiled TensorRT plan/engine
* `--contexts` - the maximum number of concurrent evaluations of the engine.
* `--port` - the port on which requests are received (default: 50051)
* `--metrics` - the port on which to expose metrics to be scraped (default: 50078)

## Clients

Three clients are available:

* `client-sync.x` - sends a blocking inference request to the service and waits for the
  response. Only 1 request is ever in-flight at a given time.

* `client-async.x` - the async client is capable of issuing multiple in-flight requests.
  Note: the load-balancer is limited to 1000 outstanding requests per client before circuit-
  breaking. Running more than 1000 requests will trigger 503s if targeting the envoy load-
  balancer. The client has no backoff and will try to send the full complement of requested
  inference requests. `siege.x` is the better async client.

* `siege.x` - constant rate (`--rate`) async engine that is hard-coded to have no more than
  950 outstanding in-flight requests. A warning is given client-side when the outstanding
  request count tops out, meaning the rate is limited by the server-side compute.

TODO:

* Add more varied test clients akin to [Netflix's Chaos Monkeys](https://github.com/Netflix/chaosmonkey), but for gRPC client behavior.
* Random rates, random pulses, canceled messages, messages with unreasonable timeouts, etc.

## Metrics

YAIS metrics are gathered and exposed via the
[prometheus-cpp](https://github.com/jupp0r/prometheus-cpp) client library. In this example,
we expose four custom [metrics](https://prometheus.io/docs/concepts/metric_types/):
2 Summaries, 1 Histogram and 1 Gauge.

* `compute_duration` and `request_duration` are summaries recorded with the model name as a
  component of the metric. This is useful for evaluating how a given model is performing,
  but it is not a good metric to aggregate across multiple services.

* `load_ratio` is a histogram of `request_duration / compute_duration`. Ideally, this
  unitless value is just over 1.0. Values higher than 1.0 are indicative of delays in the
  compute of a given request. Sources of delay include overloaded queues and/or starvation
  of resources. Histograms can be aggregated across services, which makes this metric a good
  candidate for triggering auto-scaling.

* `gpu_power` is a simple gauge that periodically reports the instantaneous power being
  consumed by the device. As the load increases on the service, the power should increase
  proportionally, until the power is capped either by device limits or compute resources.
  When power capped, the `load_ratio` will begin to increase under further increases in
  traffic.

### Acquiring Metrics

Prometheus metrics are generally scraped by a Prometheus service. When using Kubernetes to
deploy services, the [prometheus-operator](https://github.com/coreos/prometheus-operator)
provides a [`ServiceMonitor`](https://github.com/coreos/prometheus-operator#customresourcedefinitions)
which allows you to define a custom scraping configuration per service. See the
[Kubernetes example](examples/90_Kubernetes) for more details.

While testing, you can use the [`telegraf`](https://github.com/influxdata/telegraf)
application to scrape local services.
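The metrics above are registered in [`src/server.cc`](src/server.cc) using the prometheus-cpp
builder API. A minimal sketch of that pattern (a fragment only; the exposer/registry wiring
here is simplified relative to the `Metrics` singleton in `src/metrics.h`, and `compute_ms`
is a placeholder for a measured duration):

```
#include <prometheus/exposer.h>
#include <prometheus/registry.h>
#include <prometheus/summary.h>

// Expose a registry on the --metrics port
auto registry = std::make_shared<prometheus::Registry>();
prometheus::Exposer exposer("0.0.0.0:50078");
exposer.RegisterCollectable(registry);

// One Summary family; one time-series per label set
auto& family = prometheus::BuildSummary()
                   .Name("yais_inference_compute_duration_ms")
                   .Register(*registry);
auto& compute_summary = family.Add(
    {{"model", "flowers"}},
    prometheus::Summary::Quantiles{{0.5, 0.05}, {0.9, 0.01}, {0.99, 0.001}});

// After each inference completes:
compute_summary.Observe(compute_ms); // milliseconds
```

With the service exposing its registry, `telegraf` can scrape it directly: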
``` # start service telegraf -test -config /work/examples/91_Prometheus/scrape.conf ``` Here is some sample output (line breaks added for readability): ``` > yais_inference_compute_duration_ms,host=dgx,model=flowers,url=http://localhost:50078/metrics count=1000,sum=2554.070996 1530985302000000000 > yais_inference_compute_duration_ms_quantile,host=dgx,model=flowers,quantile=0.500000,url=http://localhost:50078/metrics value=2.526903 1530985302000000000 > yais_inference_compute_duration_ms_quantile,host=dgx,model=flowers,quantile=0.900000,url=http://localhost:50078/metrics value=2.625447 1530985302000000000 > yais_inference_compute_duration_ms_quantile,host=dgx,model=flowers,quantile=0.990000,url=http://localhost:50078/metrics value=2.855728 1530985302000000000 > yais_inference_request_duration_ms,host=dgx,model=flowers,url=http://localhost:50078/metrics count=1000,sum=243547.558097 1530985302000000000 > yais_inference_request_duration_ms_quantile,host=dgx,model=flowers,quantile=0.500000,url=http://localhost:50078/metrics value=253.216653 1530985302000000000 > yais_inference_request_duration_ms_quantile,host=dgx,model=flowers,quantile=0.900000,url=http://localhost:50078/metrics value=256.715759 1530985302000000000 > yais_inference_request_duration_ms_quantile,host=dgx,model=flowers,quantile=0.990000,url=http://localhost:50078/metrics value=275.407232 1530985302000000000 > yais_inference_load_ratio,host=dgx,url=http://localhost:50078/metrics +Inf=1000,1.25=1,1.5=1,10=9,100=253,2=1,count=1000,sum=95879.013208 1530985302000000000 > yais_gpus_power_usage,gpu=0,host=dgx,url=http://localhost:50078/metrics gauge=52.821 1530985302000000000 > yais_executor_queue_depth,host=dgx,url=http://localhost:50078/metrics gauge=0 1530985302000000000 ``` ### Best Practices For a good description of using histograms vs. summaries to collect meaningful metrics see: https://prometheus.io/docs/practices/histograms/ Two rules of thumb: - If you need to aggregate, choose histograms. - Otherwise, choose a histogram if you have an idea of the range and distribution of values that will be observed. Choose a summary if you need an accurate quantile, no matter what the range and distribution of the values is. ================================================ FILE: examples/02_TensorRT_GRPC/src/async-client.cc ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Original Copyright proivded below. * This work extends the original gRPC client examples to work with the * implemented server. * * Copyright 2015 gRPC authors. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ #include #include #include #include #include #include #include #include #include #include "inference.grpc.pb.h" using grpc::Channel; using grpc::ClientAsyncResponseReader; using grpc::ClientContext; using grpc::CompletionQueue; using grpc::Status; using ssd::BatchInput; using ssd::BatchPredictions; using ssd::Inference; class GreeterClient { public: explicit GreeterClient(std::shared_ptr channel) : stub_(Inference::NewStub(channel)) {} // Assembles the client's payload and sends it to the server. void SayHello(const size_t batch_id, const int batch_size) { // Data we are sending to the server. BatchInput request; request.set_batch_id(batch_id); request.set_batch_size(batch_size); // Call object to store rpc data AsyncClientCall* call = new AsyncClientCall; // stub_->PrepareAsyncSayHello() creates an RPC object, returning // an instance to store in "call" but does not actually start the RPC // Because we are using the asynchronous API, we need to hold on to // the "call" instance in order to get updates on the ongoing RPC. call->response_reader = stub_->PrepareAsyncCompute(&call->context, request, &cq_); // StartCall initiates the RPC call call->response_reader->StartCall(); // Request that, upon completion of the RPC, "reply" be updated with the // server's response; "status" with the indication of whether the operation // was successful. Tag the request with the memory address of the call object. call->response_reader->Finish(&call->reply, &call->status, (void*)call); } // Loop while listening for completed responses. // Prints out the response from the server. void AsyncCompleteRpc() { void* got_tag; bool ok = false; // Block until the next result is available in the completion queue "cq". while(cq_.Next(&got_tag, &ok)) { // The tag in this example is the memory location of the call object AsyncClientCall* call = static_cast(got_tag); // Verify that the request was completed successfully. Note that "ok" // corresponds solely to the request for updates introduced by Finish(). GPR_ASSERT(ok); if(call->status.ok()) { // std::cout << "Greeter received: " << call->reply.batch_id() << std::endl; } else { std::cout << "RPC failed" << std::endl; } // Once we're complete, deallocate the call object. 
delete call; } } void Shutdown() { cq_.Shutdown(); } private: // struct for keeping state and data information struct AsyncClientCall { // Container for the data we expect from the server. BatchPredictions reply; // Context for the client. It could be used to convey extra information to // the server and/or tweak certain RPC behaviors. ClientContext context; // Storage for the status of the RPC upon completion. Status status; std::unique_ptr> response_reader; }; // Out of the passed in Channel comes the stub, stored here, our view of the // server's exposed services. std::unique_ptr stub_; // The producer-consumer queue we use to communicate asynchronously with the // gRPC runtime. CompletionQueue cq_; }; DEFINE_int32(count, 500, "number of grpc messages to send"); DEFINE_int32(batch_size, 1, "batch_size"); DEFINE_int32(port, 50051, "server_port"); int main(int argc, char** argv) { FLAGS_alsologtostderr = 1; // It will dump to console ::google::ParseCommandLineFlags(&argc, &argv, true); // Instantiate the client. It requires a channel, out of which the actual RPCs // are created. This channel models a connection to an endpoint (in this case, // localhost at port 50051). We indicate that the channel isn't authenticated // (use of InsecureChannelCredentials()). std::ostringstream ip_port; ip_port << "localhost:" << FLAGS_port; GreeterClient greeter(grpc::CreateChannel(ip_port.str(), grpc::InsecureChannelCredentials())); // Spawn reader thread that loops indefinitely std::thread thread_ = std::thread(&GreeterClient::AsyncCompleteRpc, &greeter); auto start = std::chrono::steady_clock::now(); for(size_t i = 0; i < FLAGS_count; i++) { greeter.SayHello(i, FLAGS_batch_size); // The actual RPC call! } greeter.Shutdown(); thread_.join(); // blocks forever auto end = std::chrono::steady_clock::now(); float elapsed = std::chrono::duration(end - start).count(); std::cout << FLAGS_count << " requests in " << elapsed << "seconds; inf/sec: " << FLAGS_count * FLAGS_batch_size / elapsed << std::endl; return 0; } ================================================ FILE: examples/02_TensorRT_GRPC/src/metrics.cc ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "metrics.h" #include #include namespace trtlab { void Metrics::Initialize(uint32_t port) { auto singleton = GetSingleton(); if(singleton->m_Exposer) { LOG(WARNING) << "Metrics already initialized. This call is ignored"; return; } std::ostringstream stream; stream << "0.0.0.0:" << port; singleton->m_Exposer = std::make_unique(stream.str()); singleton->m_Exposer->RegisterCollectable(singleton->m_Registry); } auto Metrics::GetRegistry() -> Registry& { auto singleton = Metrics::GetSingleton(); return *(singleton->m_Registry); } Metrics* Metrics::GetSingleton() { static Metrics singleton; return &singleton; } Metrics::Metrics() : m_Registry(std::make_shared()) {} Metrics::~Metrics() {} } // namespace trtlab ================================================ FILE: examples/02_TensorRT_GRPC/src/metrics.h ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #pragma once #include #include using prometheus::Exposer; using prometheus::Registry; namespace trtlab { class Metrics { public: static void Initialize(uint32_t port); static auto GetRegistry() -> Registry&; protected: Metrics(); virtual ~Metrics(); static Metrics* GetSingleton(); private: std::unique_ptr m_Exposer; std::shared_ptr m_Registry; }; } // namespace trtlab ================================================ FILE: examples/02_TensorRT_GRPC/src/server.cc ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. 
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "nvml.h"

#include #include #include #include #include #include #include #include

#include "tensorrt/laboratory/core/affinity.h"
#include "tensorrt/laboratory/core/memory/allocator.h"
#include "tensorrt/laboratory/cuda/device_info.h"
#include "tensorrt/laboratory/cuda/memory/cuda_pinned_host.h"
#include "tensorrt/laboratory/inference_manager.h"
#include "tensorrt/laboratory/runtime.h"

#include "nvrpc/context.h"
#include "nvrpc/executor.h"
#include "nvrpc/server.h"
#include "nvrpc/service.h"

#include "metrics.h"

using nvrpc::AsyncRPC;
using nvrpc::AsyncService;
using nvrpc::Context;
using nvrpc::Executor;
using nvrpc::Server;
using trtlab::Affinity;
using trtlab::Allocator;
using trtlab::CudaPinnedHostMemory;
using trtlab::DeviceInfo;
using trtlab::Metrics;
using trtlab::ThreadPool;
using trtlab::TensorRT::InferenceManager;
using trtlab::TensorRT::ManagedRuntime;
using trtlab::TensorRT::Model;
using trtlab::TensorRT::Runtime;
using trtlab::TensorRT::StandardRuntime;

// Flowers Protos
#include "inference.grpc.pb.h"
#include "inference.pb.h"

using ssd::BatchInput;
using ssd::BatchPredictions;
using ssd::Inference;

/*
 * Prometheus Metrics
 *
 * It is important to collect measurements to find bottlenecks and performance issues,
 * and to trigger auto-scaling.
 */
static auto& registry = Metrics::GetRegistry();

// Summaries - Request and Compute duration on a per service basis
static auto& inf_compute =
    prometheus::BuildSummary().Name("yais_inference_compute_duration_ms").Register(registry);
static auto& inf_request =
    prometheus::BuildSummary().Name("yais_inference_request_duration_ms").Register(registry);
static const auto& quantiles =
    prometheus::Summary::Quantiles{{0.5, 0.05}, {0.90, 0.01}, {0.99, 0.001}};

// Histogram - Load Ratio = Request/Compute duration - should be just above one for a service
//             that can keep up with its current load. This metric provides more
//             detailed information on the impact of the queue depth because it accounts
//             for request time.
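// The bucket edges below are unitless load ratios: a sample near 1.25 means a request
// spent little time queued relative to its compute, while the 10.0 and 100.0 buckets
// capture requests whose wall time was dominated by queuing (an overloaded or starved
// service).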
static const std::vector<double> buckets = {1.25, 1.50, 2.0, 10.0, 100.0}; // unitless
static auto& inf_load_ratio_fam =
    prometheus::BuildHistogram().Name("yais_inference_load_ratio").Register(registry);
static auto& inf_load_ratio = inf_load_ratio_fam.Add({}, buckets);

// Gauge - Periodically measure and report GPU power utilization. As the load increases
//         on the service, the power should increase proportionally, until the power is
//         capped either by device limits or compute resources. At this level, the
//         inf_load_ratio will begin to increase under further increases in traffic.
static auto& power_gauge_fam =
    prometheus::BuildGauge().Name("yais_gpus_power_usage").Register(registry);
static auto& power_gauge = power_gauge_fam.Add({{"gpu", "0"}});

/*
 * External Data Source
 *
 * Attaches to a System V shared memory segment owned by an external resource.
 * Example: the results of an image decode service could use this mechanism to transfer
 * large tensors to an inference service by simply passing an offset.
 */
float* GetSharedMemory(const std::string& address);

/*
 * YAIS Resources - TensorRT InferenceManager + ThreadPools + External Datasource
 */
class FlowersResources : public InferenceManager
{
  public:
    explicit FlowersResources(int max_executions, int max_buffers, int nCuda, int nResp,
                              float* sysv_data)
        : InferenceManager(max_executions, max_buffers), m_CudaThreadPool(nCuda),
          m_ResponseThreadPool(nResp), m_SharedMemory(sysv_data)
    {
    }

    ThreadPool& GetCudaThreadPool() { return m_CudaThreadPool; }
    ThreadPool& GetResponseThreadPool() { return m_ResponseThreadPool; }

    float* GetSysvOffset(size_t offset_in_bytes)
    {
        return &m_SharedMemory[offset_in_bytes / sizeof(float)];
    }

  private:
    ThreadPool m_CudaThreadPool;
    ThreadPool m_ResponseThreadPool;
    float* m_SharedMemory;
};

/*
 * nvRPC Context - Defines the logic of the RPC.
 */
class FlowersContext final : public Context<BatchInput, BatchPredictions, FlowersResources>
{
    void ExecuteRPC(RequestType& input, ResponseType& output) final override
    {
        // Executing on an Executor thread - we don't want to block message handling, so we offload
        GetResources()->GetCudaThreadPool().enqueue([this, &input, &output]() {
            // Executed on a thread from CudaThreadPool
            auto model = GetResources()->GetModel("flowers");
            auto buffers = GetResources()->GetBuffers(); // <=== Limited Resource; May Block !!!
            auto bindings = buffers->CreateBindings(model);
            bindings->SetBatchSize(input.batch_size());
            bindings->SetHostAddress(0, GetResources()->GetSysvOffset(input.sysv_offset()));
            bindings->CopyToDevice(bindings->InputBindings());
            auto ctx = GetResources()->GetExecutionContext(model); // <=== Limited Resource; May Block !!!
            ctx->Infer(bindings);
            bindings->CopyFromDevice(bindings->OutputBindings());
            // All Async CUDA work has been queued - this thread's work is done.
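            // Handing off to a second pool decouples kernel launching from completion:
            // this CudaThreadPool thread can immediately begin enqueuing the next
            // request's H2D/compute/D2H work, while a ResponseThreadPool thread blocks
            // on the synchronization below.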
GetResources()->GetResponseThreadPool().enqueue([this, &input, &output, model, bindings, ctx]() mutable { // Executed on a thread from ResponseThreadPool auto compute_time = ctx->Synchronize(); ctx.reset(); // Finished with the Execution Context - Release it to competing // threads bindings->Synchronize(); // Blocks on H2D, Compute, D2H Pipeline WriteBatchPredictions(input, output, (float*)bindings->HostAddress(1)); bindings.reset(); // Finished with Buffers - Release it to competing threads auto request_time = Walltime(); output.set_compute_time(static_cast(compute_time)); output.set_total_time(static_cast(request_time)); this->FinishResponse(); // The Response is now sending; Record some metrics and be done inf_compute.Add({{"model", model->Name()}}, quantiles).Observe(compute_time * 1000); inf_request.Add({{"model", model->Name()}}, quantiles).Observe(request_time * 1000); inf_load_ratio.Observe(request_time / compute_time); }); }); } void WriteBatchPredictions(RequestType& input, ResponseType& output, float* scores) { int N = input.batch_size(); auto nClasses = GetResources()->GetModel("flowers")->GetBinding(1).elementsPerBatchItem; size_t cntr = 0; for(int p = 0; p < N; p++) { auto element = output.add_elements(); /* Customize the post-processing of the output tensor *\ float max_val = -1.0; int max_idx = -1; for (int i = 0; i < nClasses; i++) { if (max_val < scores[cntr]) { max_val = scores[cntr]; max_idx = i; } cntr++; } auto top1 = element->add_predictions(); top1->set_class_id(max_idx); top1->set_score(max_val); \* Customize the post-processing of the output tensor */ } output.set_batch_id(input.batch_id()); } }; static bool ValidateEngine(const char* flagname, const std::string& value) { struct stat buffer; return (stat(value.c_str(), &buffer) == 0); } static bool ValidateBytes(const char* flagname, const std::string& value) { trtlab::StringToBytes(value); return true; } DEFINE_string(engine, "/path/to/tensorrt.engine", "TensorRT serialized engine"); DEFINE_validator(engine, &ValidateEngine); DEFINE_string(dataset, "127.0.0.1:4444", "GRPC Dataset/SharedMemory Service Address"); DEFINE_int32(contexts, 1, "Number of Execution Contexts"); DEFINE_int32(buffers, 0, "Number of Input/Output Buffers"); DEFINE_string(runtime, "default", "TensorRT Runtime"); DEFINE_int32(execution_threads, 1, "Number of RPC execution threads"); DEFINE_int32(preprocessing_threads, 0, "Number of preprocessing threads"); DEFINE_int32(kernel_launching_threads, 1, "Number of threads to launch CUDA kernels"); DEFINE_int32(postprocessing_threads, 2, "Number of postprocessing threads"); DEFINE_string(max_recv_bytes, "10MiB", "Maximum number of bytes for incoming messages"); DEFINE_validator(max_recv_bytes, &ValidateBytes); DEFINE_int32(port, 50051, "Port to listen for gRPC requests"); DEFINE_int32(metrics, 50078, "Port to expose metrics for scraping"); int main(int argc, char* argv[]) { FLAGS_alsologtostderr = 1; // Log to console ::google::InitGoogleLogging("flowers"); ::google::ParseCommandLineFlags(&argc, &argv, true); // Set CPU Affinity to be near the GPU auto cpus = DeviceInfo::Affinity(0); Affinity::SetAffinity(cpus); // Enable metrics on port Metrics::Initialize(FLAGS_metrics); // Create a gRPC server bound to IP:PORT std::ostringstream ip_port; ip_port << "0.0.0.0:" << FLAGS_port; Server server(ip_port.str()); // Modify MaxReceiveMessageSize auto bytes = trtlab::StringToBytes(FLAGS_max_recv_bytes); server.Builder().SetMaxReceiveMessageSize(bytes); LOG(INFO) << "gRPC MaxReceiveMessageSize = " << 
trtlab::BytesToString(bytes);

    // A server can host multiple services
    LOG(INFO) << "Register Service (flowers::Inference) with Server";
    auto inferenceService = server.RegisterAsyncService<Inference::AsyncService>();

    // An RPC has two components that need to be specified when registering with the service:
    // 1) Type of Execution Context (FlowersContext). The execution context defines the behavior
    //    of the RPC, i.e. it contains the control logic for the execution of the RPC.
    // 2) The Request function (RequestCompute) which was generated by gRPC when compiling the
    //    protobuf which defined the service. This function is responsible for queuing the
    //    RPC's execution context to the gRPC runtime.
    LOG(INFO) << "Register RPC (flowers::Inference::Compute) with Service (flowers::Inference)";
    auto rpcCompute =
        inferenceService->RegisterRPC<FlowersContext>(&Inference::AsyncService::RequestCompute);

    // Buffers default to execution contexts + 2
    // Allows for 1 H2D, N TensorRT Executions, 1 D2H to be inflight
    auto buffers = FLAGS_buffers;
    if(buffers == 0) buffers = FLAGS_contexts + 2;

    // Initialize Resources
    LOG(INFO) << "Initializing Resources for RPC (flowers::Inference::Compute)";
    auto rpcResources = std::make_shared<FlowersResources>(
        FLAGS_contexts, // number of IExecutionContexts - scratch space for DNN activations
        buffers, // number of host/device buffers for input/output tensors
        FLAGS_kernel_launching_threads, // number of threads used to execute cuda kernel launches
        FLAGS_postprocessing_threads, // number of threads used to write and complete responses
        GetSharedMemory(FLAGS_dataset) // pointer to data in shared memory
    );

    std::shared_ptr<Runtime> runtime;
    if(FLAGS_runtime == "default")
    {
        runtime = std::make_shared<StandardRuntime>();
    }
    else if(FLAGS_runtime == "unified")
    {
        runtime = std::make_shared<ManagedRuntime>();
    }
    else
    {
        LOG(FATAL) << "Invalid TensorRT Runtime";
    }

    rpcResources->RegisterModel("flowers", runtime->DeserializeEngine(FLAGS_engine));
    rpcResources->AllocateResources();

    // Create Executors - Executors provide the messaging processing resources for the RPCs
    LOG(INFO) << "Initializing Executor";
    auto executor = server.RegisterExecutor(new Executor(1));

    // You can register RPC execution contexts from any registered RPC on any executor.
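    // The context count below (100) bounds how many Compute RPCs can be in some stage
    // of execution at once on this server; additional incoming requests wait in gRPC's
    // queue until a context is recycled via FinishResponse().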
LOG(INFO) << "Registering Execution Contexts for RPC (flowers::Inference::Compute) with Executor"; executor->RegisterContexts(rpcCompute, rpcResources, 100); LOG(INFO) << "Running Server"; server.Run(std::chrono::milliseconds(2000), [] { // Query GPU Power nvmlDevice_t gpu; unsigned int power; CHECK_EQ(nvmlDeviceGetHandleByIndex(0, &gpu), NVML_SUCCESS) << "Failed to get Device for index=" << 0; CHECK_EQ(nvmlDeviceGetPowerUsage(gpu, &power), NVML_SUCCESS) << "Failed to get Power Usage for GPU=" << 0; power_gauge.Set((double)power * 0.001); }); } static auto pinned_memory = std::make_unique>(1024 * 1024 * 1024); float* GetSharedMemory(const std::string& address) { /* data in shared memory should go here - for the sake of quick examples just use and emptry * array */ pinned_memory->Fill((char)0); return (float*)pinned_memory->Data(); // the following code connects to a shared memory service to allow for non-serialized transfers // between microservices /* InfoRequest request; Info reply; grpc::ClientContext context; auto channel = grpc::CreateChannel(address, grpc::InsecureChannelCredentials()); auto stub = SharedMemoryDataSet::NewStub(channel); auto status = stub->GetInfo(&context, request, &reply); CHECK(status.ok()) << "Dataset shared memory request failed"; DLOG(INFO) << "SysV ShmKey: " << reply.sysv_key(); int shmid = shmget(reply.sysv_key(), 0, 0); DLOG(INFO) << "SysV ShmID: " << shmid; float* data = (float*) shmat(shmid, 0, 0); CHECK(data) << "SysV Attached failed"; return data; */ } ================================================ FILE: examples/02_TensorRT_GRPC/src/siege.cc ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Original Copyright proivded below. * This work extends the original gRPC client examples to work with the * implemented server. * * Copyright 2015 gRPC authors. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ #include #include #include #include #include #include #include #include #include #include "inference.grpc.pb.h" #include "tensorrt/laboratory/core/utils.h" using grpc::Channel; using grpc::ClientAsyncResponseReader; using grpc::ClientContext; using grpc::CompletionQueue; using grpc::Status; using ssd::BatchInput; using ssd::BatchPredictions; using ssd::Inference; static int g_BatchSize = 1; class GreeterClient { public: explicit GreeterClient(std::shared_ptr channel, int max_outstanding) : stub_(Inference::NewStub(channel)), m_OutstandingMessageCount(0), m_MaxOutstandingMessageCount(max_outstanding) { } // Assembles the client's payload and sends it to the server. void SayHello(const size_t batch_id, const int batch_size, char* bytes, uint64_t total) { // Data we are sending to the server. { std::unique_lock lock(m_Mutex); m_OutstandingMessageCount++; while(m_OutstandingMessageCount >= m_MaxOutstandingMessageCount) { LOG_FIRST_N(WARNING, 10) << "Initiated Backoff - (Siege Rate > Server Compute " "Rate) - Server Queues are full."; m_Condition.wait(lock); } } auto start = std::chrono::high_resolution_clock::now(); BatchInput request; request.set_batch_id(batch_id); request.set_batch_size(batch_size); if(total) { request.set_data(bytes, total); } // Call object to store rpc data AsyncClientCall* call = new AsyncClientCall; // stub_->PrepareAsyncSayHello() creates an RPC object, returning // an instance to store in "call" but does not actually start the RPC // Because we are using the asynchronous API, we need to hold on to // the "call" instance in order to get updates on the ongoing RPC. call->response_reader = stub_->PrepareAsyncCompute(&call->context, request, &cq_); // StartCall initiates the RPC call call->response_reader->StartCall(); // Request that, upon completion of the RPC, "reply" be updated with the // server's response; "status" with the indication of whether the operation // was successful. Tag the request with the memory address of the call object. call->response_reader->Finish(&call->reply, &call->status, (void*)call); float elapsed = std::chrono::duration(std::chrono::high_resolution_clock::now() - start).count(); m_RequestCalls++; m_TotalRequestTime += elapsed; // LOG_EVERY_N(INFO, 200) << "Request overhead: " << m_TotalRequestTime/m_RequestCalls; } // Loop while listening for completed responses. // Prints out the response from the server. void AsyncCompleteRpc() { void* got_tag; bool ok = false; size_t cntr = 0; auto start = std::chrono::steady_clock::now(); float last = 0.0; // Block until the next result is available in the completion queue "cq". while(cq_.Next(&got_tag, &ok)) { // The tag in this example is the memory location of the call object AsyncClientCall* call = static_cast(got_tag); // Verify that the request was completed successfully. Note that "ok" // corresponds solely to the request for updates introduced by Finish(). GPR_ASSERT(ok); if(call->status.ok()) { // std::cout << "Greeter received: " << call->reply.batch_id() << std::endl; } else { std::cout << "RPC failed" << std::endl; } // Once we're complete, deallocate the call object. 
delete call; cntr++; float elapsed = std::chrono::duration(std::chrono::steady_clock::now() - start).count(); if(elapsed - last > 0.5) { LOG(INFO) << "avg. rate: " << (float)cntr / (elapsed - last) << "( " << (float)(cntr * g_BatchSize) / (elapsed - last) << " inf/sec)"; last = elapsed; cntr = 0; } { std::unique_lock lock(m_Mutex); m_OutstandingMessageCount--; } m_Condition.notify_one(); } } void Shutdown() { cq_.Shutdown(); } private: // struct for keeping state and data information struct AsyncClientCall { // Container for the data we expect from the server. BatchPredictions reply; // Context for the client. It could be used to convey extra information to // the server and/or tweak certain RPC behaviors. ClientContext context; // Storage for the status of the RPC upon completion. Status status; std::unique_ptr> response_reader; }; // Out of the passed in Channel comes the stub, stored here, our view of the // server's exposed services. std::unique_ptr stub_; // The producer-consumer queue we use to communicate asynchronously with the // gRPC runtime. CompletionQueue cq_; // mutex to help control rate std::mutex m_Mutex; std::condition_variable m_Condition; int m_OutstandingMessageCount; int m_MaxOutstandingMessageCount; float m_TotalRequestTime; size_t m_RequestCalls; }; static bool ValidateBytes(const char* flagname, const std::string& value) { trtlab::StringToBytes(value); return true; } DEFINE_int32(count, 1000000, "number of grpc messages to send"); DEFINE_int32(batch_size, 1, "batch_size"); DEFINE_int32(max_outstanding, 950, "maximum outstanding requests"); DEFINE_int32(port, 50051, "server_port"); DEFINE_double(rate, 1.0, "messages per second"); DEFINE_double(max_rate, 100000, "maximum number of messages per second when func is applied"); DEFINE_double(alpha, 0, "alpha"); DEFINE_double(beta, 1, "beta"); DEFINE_string(func, "constant", "constant, linear or cyclic"); DEFINE_string(bytes, "0B", "add extra bytes to the request payload"); DEFINE_validator(bytes, &ValidateBytes); int main(int argc, char** argv) { FLAGS_alsologtostderr = 1; // It will dump to console ::google::ParseCommandLineFlags(&argc, &argv, true); g_BatchSize = FLAGS_batch_size; auto bytes = trtlab::StringToBytes(FLAGS_bytes); char extra_bytes[bytes]; if(bytes) LOG(INFO) << "Sending an addition " << trtlab::BytesToString(bytes) << " bytes in request payload"; // using a fixed rate of 15us per rpc call. i could adjust dynamically as i'm tracking // the call overhead, but it's close enough. auto start = std::chrono::system_clock::now(); auto walltime = [start]() -> double { return std::chrono::duration(std::chrono::system_clock::now() - start).count(); }; std::map> rates_by_name; rates_by_name["constant"] = []() -> double { return std::min(FLAGS_rate, FLAGS_max_rate); }; rates_by_name["linear"] = [start, walltime]() -> double { return std::min(FLAGS_rate + (FLAGS_alpha / 60.0) * walltime(), FLAGS_max_rate); }; rates_by_name["cyclic"] = [start, walltime]() -> double { return std::min(FLAGS_rate + FLAGS_alpha * std::sin(2.0 * 3.14159 * (FLAGS_beta / 60.0) * walltime()), FLAGS_max_rate); }; auto search = rates_by_name.find(FLAGS_func); if(search == rates_by_name.end()) { LOG(FATAL) << "--func must be constant, linear or cyclic; your value = " << FLAGS_func; } auto sleepy = [search]() -> double { auto sleep_time = ((std::chrono::seconds(1) / std::max((search->second)(), 2.0))) - std::chrono::microseconds(15); return std::chrono::duration(sleep_time).count(); }; // Instantiate the client. 
It requires a channel, out of which the actual RPCs // are created. This channel models a connection to an endpoint (in this case, // localhost at port 50051). We indicate that the channel isn't authenticated // (use of InsecureChannelCredentials()). std::ostringstream ip_port; ip_port << "localhost:" << FLAGS_port; grpc::ChannelArguments ch_args; ch_args.SetMaxReceiveMessageSize(-1); GreeterClient greeter( grpc::CreateCustomChannel(ip_port.str(), grpc::InsecureChannelCredentials(), ch_args), FLAGS_max_outstanding); // Spawn reader thread that loops indefinitely std::thread thread_ = std::thread(&GreeterClient::AsyncCompleteRpc, &greeter); for(size_t i = 0; i < FLAGS_count; i++) { greeter.SayHello(i, FLAGS_batch_size, extra_bytes, bytes); // The actual RPC call! auto start = std::chrono::high_resolution_clock::now(); while(std::chrono::duration(std::chrono::high_resolution_clock::now() - start) .count() < sleepy()) { std::this_thread::yield(); } } greeter.Shutdown(); thread_.join(); // blocks forever auto elapsed = walltime(); std::cout << FLAGS_count << " requests in " << elapsed << "seconds; inf/sec: " << FLAGS_count * FLAGS_batch_size / elapsed << std::endl; return 0; } ================================================ FILE: examples/02_TensorRT_GRPC/src/sync-client.cc ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Original Copyright proivded below. * This work extends the original gRPC client examples to work with the * implemented server. * * Copyright 2015 gRPC authors. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. * */ #include #include #include #include #include #include #include #include "inference.grpc.pb.h" using grpc::Channel; using grpc::ClientContext; using grpc::Status; using ssd::BatchInput; using ssd::BatchPredictions; using ssd::Inference; class SimpleClient { public: SimpleClient(std::shared_ptr channel) : stub_(Inference::NewStub(channel)) {} // Assembles the client's payload, sends it and presents the response back // from the server. int Compute(const int batch_id, const int batch_size) { // Data we are sending to the server. BatchInput request; request.set_batch_id(batch_id); request.set_batch_size(batch_size); // Container for the data we expect from the server. BatchPredictions reply; // Context for the client. It could be used to convey extra information to // the server and/or tweak certain RPC behaviors. ClientContext context; // The actual RPC. Status status = stub_->Compute(&context, request, &reply); // Act upon its status. if(status.ok()) { return reply.batch_id(); } else { std::cout << status.error_code() << ": " << status.error_message() << std::endl; return -1; } } private: std::unique_ptr stub_; }; DEFINE_int32(count, 1000, "number of grpc messages to send"); DEFINE_int32(port, 50051, "server_port"); DEFINE_int32(batch, 1, "batch size"); int main(int argc, char** argv) { // Instantiate the client. It requires a channel, out of which the actual RPCs // are created. This channel models a connection to an endpoint (in this case, // localhost at port 50051). We indicate that the channel isn't authenticated // (use of InsecureChannelCredentials()). FLAGS_alsologtostderr = 1; // It will dump to console ::google::ParseCommandLineFlags(&argc, &argv, true); std::ostringstream ip_port; ip_port << "localhost:" << FLAGS_port; SimpleClient client(grpc::CreateChannel(ip_port.str(), grpc::InsecureChannelCredentials())); auto start = std::chrono::steady_clock::now(); for(int i = 0; i < FLAGS_count; i++) { auto reply = client.Compute(i, FLAGS_batch); if(reply == -1 || reply != i) std::cout << "BatchId received: " << reply << std::endl; } auto end = std::chrono::steady_clock::now(); float elapsed = std::chrono::duration(end - start).count(); std::cout << FLAGS_count << " requests in " << elapsed << " seconds; inf/sec: " << FLAGS_count * FLAGS_batch / elapsed << std::endl; return 0; } ================================================ FILE: examples/03_Batching/CMakeLists.txt ================================================ # Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. 
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

add_executable(streaming-service-echo.x streaming-service.cc)
target_link_libraries(streaming-service-echo.x trtlab::nvrpc echo-protos gflags)

add_executable(batching-service-echo.x inference-batcher.cc)
target_link_libraries(batching-service-echo.x trtlab::nvrpc echo-protos gflags)

================================================
FILE: examples/03_Batching/README.md
================================================
# Batching Service

A batching service is a service that tries to collect sets of similar requests into a
collective batch which can be executed in a single shot.

#### Why do we want to batch?

In the case of Deep Neural Networks, batching can improve the computational efficiency of
executing on a GPU by increasing the operational intensity, i.e. improving the ratio of the
number of math operations per memory transaction. This translates to improved throughput,
better hardware utilization and cost reductions.

#### Sounds great, but what's the catch?

In many cases, batching can add latency to an individual request. Because a batch of more
than 1 item, BatchN, is computed as a single unit, the time to compute BatchN is greater
than Batch1. However, in many cases, the compute-time delta between Batch1 and Batch2/4/8 is
fairly small due to the improved operational efficiency. Secondly, because batching requires
requests to be collected, there is a timed collection window prior to the compute. The first
request in a batch sees the longest latency. The worst-case increased latency is bounded by
the following formula (a worked example with concrete numbers appears later in this README):

```
worst_additional_latency = batch_window_timeout + batchN_compute - batch1_compute
```

#### When to Batch?

You want to batch requests when your service has very high load and you can tolerate minor
increases in latency. Throughput improvements can be 2-5x, which translates into direct cost
savings.

#### What does this Batching Service do for me?

The basic YAIS service examples [01_GRPC](../01_GRPC) and [02_GRPC_TensorRT](../02_GRPC_TensorRT)
implement high-performance send/recv unary services. That is, the client sends a request
which is computed and a response is returned. The client could in theory create a single
message that is itself a batch, i.e. multiple image files or sentences to be translated.
However, in the most common real-world use cases, the clients of a service send a single
item at a time. This keeps both the client logic and the lifecycle of the request simple.

If this is your RPC definition,

```
service Inference {
    rpc Compute (Input) returns (Output) {}
}
```

then, instead of implementing `rpc Compute` to perform the inference computation, we hijack
that RPC and turn it into a batcher.
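To make the latency bound above concrete before walking through the implementation, assume
(illustrative numbers only) a 2.0 ms batching window, a 2.5 ms Batch1 compute, and a 4.0 ms
Batch8 compute:

```
worst_additional_latency = 2.0 + 4.0 - 2.5 = 3.5 ms
```

Meanwhile, throughput rises from 1 / 2.5 ms = 400 inf/sec to 8 / 4.0 ms = 2000 inf/sec, a 5x
improvement in exchange for at most 3.5 ms of added latency.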
In the [`inference-batcher.cc`](inference-batcher.cc) file, you will see that we implement
our batching service as the `Compute` method. The batching service collects incoming `Input`
requests and forwards them via a gRPC stream to a service that accepts a "batching stream".

A "batching stream" is a stream where the endpoint service reads and collects the elements
of the stream until the client signifies it is done writing. That is the signal at which
YAIS performs a single batched inference call on the concatenated set of requests that came
in over the stream. After the inference calculation is complete, the server writes the
results for each request item to the stream. That is, for each request that came in on the
stream, the server is expected to return a response.

We still need to compute inference on the batching stream. This is performed by
[streaming-service.cc](streaming-service.cc). The `streaming-service` implements the
`BatchedCompute` RPC method using a `BatchingContext`.

```
service Inference {
    rpc Compute (Input) returns (Output) {}
    rpc BatchedCompute (stream Input) returns (stream Output) {}
}
```

Because the stream consists of an array of individual messages, you simply need to make
minor modifications to your existing Batch1 service to preprocess and concat the incoming
requests together to form a single batch compute. For each `Input` item in the stream, it is
expected that the service writes an `Output` response in the same order as the inputs (FIFO).

The batching service doesn't need to know anything about the format of the `Input`/`Output`
messages. It simply accepts and forwards them. The result is that this batching service
example should be able to work with any unary gRPC service with any request/response
message. You simply need to implement a streaming service capable of handling the forwarding
stream.

## Running Example

```
./launch_batching.sh
```

```
... # streaming service startup
... # batching service startup
Starting a shell keeping the services and load-balancer running...
Try python unary_client.py - exit shell to kill services
Batching Subshell: python unary_client.py
I0822 14:48:18.900671 50 inference-batcher.cc:344] incoming unary request
I0822 14:48:18.902642 41 inference-batcher.cc:109] Client using CQ: 0x14470f0
I0822 14:48:18.902680 41 inference-batcher.cc:140] Starting Batch Forwarding of Size 1 for Tag 0x1458450
I0822 14:48:18.903472 35 streaming-service.cc:61] Recieved request with batch_id=78
I0822 14:48:18.903504 35 streaming-service.cc:54] Response with batch_id=78
I0822 14:48:18.903656 47 inference-batcher.cc:243] Batch Forwarding Completed for Tag 0x1458450
Received msg with batch_id=78
```

================================================
FILE: examples/03_Batching/inference-batcher.cc
================================================
/* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include #include #include #include

#include "nvrpc/context.h"
#include "nvrpc/executor.h"
#include "nvrpc/server.h"
#include "tensorrt/laboratory/core/thread_pool.h"

using nvrpc::Context;
using nvrpc::Executor;
using nvrpc::Server;
using trtlab::ThreadPool;

#include "moodycamel/blockingconcurrentqueue.h"

using moodycamel::BlockingConcurrentQueue;
using moodycamel::ConsumerToken;
using moodycamel::ProducerToken;

#include "echo.grpc.pb.h"
#include "echo.pb.h"

/**
 * @brief Batching Service for Unary Requests
 *
 * Exposes a Unary (send/recv) interface for a given RPC, but rather than
 * computing the RPC, the service simply batches the incoming requests and
 * forwards them via a gRPC stream to a service that implements the actual
 * compute portion of the RPC.
 *
 * The backend compute service is not a Unary service. Rather, it must
 * implement the LifeCycleBatching service Context, i.e. BatchingContext.
 * The other application in this folder implements the backend service.
 *
 * Streams are used as a forwarding mechanism because of how they interact
 * with a load-balancer. Unlike unary requests, which get balanced on each
 * request, a stream only gets balanced when it is opened. All items of a
 * stream go to the same endpoint service.
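 *
 * (For example, behind an L7 balancer such as Envoy, each forwarded batch is
 * one stream, so load is balanced per-batch rather than per-item; see the
 * 99_LoadBalancer example.)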
 * * @tparam ServiceType * @tparam Request * @tparam Response */ template<typename ServiceType, typename Request, typename Response> struct BatchingService { using Callback = std::function<void(bool)>; struct MessageType { Request* request; Response* response; Callback callback; }; /** * @brief Forwards incoming Unary requests via a gRPC Stream to * a Batched Streaming Service that implements the actual RPC */ class Client { public: using PrepareFunc = std::function<std::unique_ptr<::grpc::ClientAsyncReaderWriter<Request, Response>>( ::grpc::ClientContext*, ::grpc::CompletionQueue*)>; Client(PrepareFunc prepare_func, std::shared_ptr<ThreadPool> thread_pool) : m_PrepareFunc(prepare_func), m_ThreadPool(thread_pool), m_CurrentCQ(0) { for(decltype(m_ThreadPool->Size()) i = 0; i < m_ThreadPool->Size(); i++) { LOG(INFO) << "Starting Client Progress Engine #" << i; m_CQs.emplace_back(new ::grpc::CompletionQueue); auto cq = m_CQs.back().get(); m_ThreadPool->enqueue([this, cq] { ProgressEngine(*cq); }); } } void WriteAndCloseStream(uint32_t messages_count, MessageType* messages) { auto cq = m_CQs[++m_CurrentCQ % m_CQs.size()].get(); LOG(INFO) << "Client using CQ: " << (void*)cq; auto ctx = new Call; for(uint32_t i = 0; i < messages_count; i++) { ctx->Push(messages[i]); } ctx->m_Stream = m_PrepareFunc(&ctx->m_Context, cq); ctx->Start(); } private: class Call { public: Call() : m_Started(false), m_NextState(&Call::StateInvalid) {} virtual ~Call() {} void Push(MessageType& message) { if(m_Started) LOG(FATAL) << "Stream started; No pushing allowed."; m_Requests.push(message.request); m_Responses.push(message.response); m_CallbackByResponse[message.response] = message.callback; } void Start() { LOG(INFO) << "Starting Batch Forwarding of Size " << m_Requests.size() << " for Tag " << Tag(); m_Started = true; // no more pushes once the stream is started m_NextState = &Call::StateWriteDone; m_Stream->StartCall(Tag()); } private: bool RunNextState(bool ok) { bool ret = (this->*m_NextState)(ok); if(!ret) DLOG(INFO) << "RunNextState returning false"; return ret; } void* Tag() { return static_cast<void*>(this); } bool Fail() { LOG(FATAL) << "Fail"; return false; } void WriteNext() { if(m_Requests.size()) { auto request = m_Requests.front(); m_Requests.pop(); DLOG(INFO) << "forwarding request"; m_NextState = &Call::StateWriteDone; m_Stream->Write(*request, Tag()); } else { DLOG(INFO) << "closing client stream for writing"; m_NextState = &Call::StateCloseStreamDone; m_Stream->WritesDone(Tag()); } } void ReadNext() { if(m_Responses.size()) { DLOG(INFO) << "waiting on response"; auto response = m_Responses.front(); m_NextState = &Call::StateReadDone; m_Stream->Read(response, Tag()); } else { DLOG(INFO) << "waiting on finished message from server"; m_NextState = &Call::StateFinishedDone; m_Stream->Finish(&m_Status, Tag()); } } bool StateWriteDone(bool ok) { if(!ok) return Fail(); DLOG(INFO) << "request forwarded!"; WriteNext(); return true; } bool StateReadDone(bool ok) { if(!ok) return Fail(); DLOG(INFO) << "response received"; auto response = m_Responses.front(); m_Responses.pop(); auto search = m_CallbackByResponse.find(response); if(search == m_CallbackByResponse.end()) LOG(FATAL) << "Callback for response not found"; ReadNext(); // Execute callback which will complete the unary request for this stream item DLOG(INFO) << "triggering callback on held receive context"; search->second(true); DLOG(INFO) << "callback completed"; return true; } bool StateCloseStreamDone(bool ok) { if(!ok) return Fail(); DLOG(INFO) << "closed client stream for writing"; ReadNext(); return true; } bool StateFinishedDone(bool ok) { if(m_Status.ok()) DLOG(INFO) << "ClientContext: " << Tag() << " finished with OK"; else DLOG(INFO) << "ClientContext: " <<
Tag() << " finished with CANCELLED"; m_NextState = &Call::StateInvalid; LOG(INFO) << "Batch Forwarding Completed for Tag " << Tag(); return false; } bool StateInvalid(bool ok) { LOG(FATAL) << "This should never be called"; } private: std::queue m_Requests; std::queue m_Responses; std::map m_CallbackByResponse; bool (Call::*m_NextState)(bool); ::grpc::Status m_Status; ::grpc::ClientContext m_Context; std::unique_ptr<::grpc::ClientAsyncReaderWriter> m_Stream; bool m_Started; friend class Client; }; void ProgressEngine(::grpc::CompletionQueue& cq) { void* tag; bool ok = false; while(cq.Next(&tag, &ok)) { CHECK(ok) << "not ok"; Call* call = static_cast(tag); if(!call->RunNextState(ok)) { DLOG(INFO) << "Deleting Stream: " << tag; delete call; } } } int m_CurrentCQ; PrepareFunc m_PrepareFunc; std::shared_ptr m_ThreadPool; std::vector> m_CQs; }; class Resources : public ::trtlab::Resources { public: Resources(uint32_t max_batch_size, uint64_t timeout, std::shared_ptr client) : m_MaxBatchsize(max_batch_size), m_Timeout(timeout), m_Client(client) { } virtual void PreprocessRequest(Request* req) {} void Push(Request* req, Response* resp, Callback callback) { // thread_local ProducerToken token(m_MessageQueue); // m_MessageQueue.enqueue(token, MessageType(req, resp, callback)); PreprocessRequest(req); m_MessageQueue.enqueue(MessageType{req, resp, callback}); } void ProgressEngine() { constexpr uint64_t quanta = 100; const double timeout = static_cast(m_Timeout - quanta) / 1000000.0; size_t total_count; size_t max_batch; std::vector messages; messages.resize(m_MaxBatch) thread_local ConsumerToken token(m_MessageQueue); for(;;) { max_batch = m_MaxBatchsize; total_count = 0; auto start = std::chrono::steady_clock::now(); auto elapsed = [start]() -> double { return std::chrono::duration(std::chrono::steady_clock::now() - start) .count(); }; // initial pull - if not successful, restart loop // if successful, then open a stream, push message to stream and continue to collect // requests until the max_batch_size is reach for the timeout is triggered // finish sending // r do { auto count = m_MessageQueue.wait_dequeue_bulk_timed( token, &messages[total_count], max_batch, quanta); total_count += count; max_batch -= count; } while(total_count && total_count < m_MaxBatchsize && elapsed() < timeout); if(total_count) { m_Client->WriteAndCloseStream(total_count, messages); } } } private: size_t m_MaxBatchsize; uint64_t m_Timeout; std::shared_ptr m_Client; BlockingConcurrentQueue m_MessageQueue; }; class ReceiveContext final : public ::nvrpc::Context { void ExecuteRPC(Request& request, Response& response) final override { LOG(INFO) << "incoming unary request"; this->GetResources()->Push(&request, &response, [this](bool ok) { if(ok) this->FinishResponse(); else { LOG(INFO) << "shoot"; this->CancelResponse(); } }); } }; }; DEFINE_uint32(max_batch_size, 8, "Maximum batch size to collect and foward"); DEFINE_uint64(timeout_usecs, 2000, "Batching window timeout in microseconds"); DEFINE_uint32(max_batches_in_flight, 1, "Maximum number of forwarded batches"); DEFINE_uint32(receiving_threads, 1, "Number of Forwarding threads"); DEFINE_uint32(forwarding_threads, 1, "Number of Forwarding threads"); DEFINE_string(forwarding_target, "localhost:50051", "Batched Compute Service / Load-Balancer"); using InferenceBatchingService = BatchingService; int main(int argc, char* argv[]) { FLAGS_alsologtostderr = 1; // Log to console ::google::InitGoogleLogging("simpleBatchingService"); ::google::ParseCommandLineFlags(&argc, &argv, 
true); auto forwarding_threads = std::make_shared<ThreadPool>(FLAGS_forwarding_threads); auto channel = grpc::CreateChannel(FLAGS_forwarding_target, grpc::InsecureChannelCredentials()); auto stub = ::simple::Inference::NewStub(channel); auto forwarding_prepare_func = [&stub](::grpc::ClientContext * context, ::grpc::CompletionQueue * cq) -> auto { return std::move(stub->PrepareAsyncBatchedCompute(context, cq)); }; auto client = std::make_shared<InferenceBatchingService::Client>(forwarding_prepare_func, forwarding_threads); auto rpcResources = std::make_shared<InferenceBatchingService::Resources>( FLAGS_max_batch_size, FLAGS_timeout_usecs, client); Server server("0.0.0.0:50049"); auto recvService = server.RegisterAsyncService<::simple::Inference>(); auto rpcCompute = recvService->RegisterRPC<InferenceBatchingService::ReceiveContext>( &::simple::Inference::AsyncService::RequestCompute); uint64_t context_count = FLAGS_max_batch_size * FLAGS_max_batches_in_flight; uint64_t contexts_per_executor_thread = std::max(context_count / FLAGS_receiving_threads, 1UL); auto executor = server.RegisterExecutor(new Executor(FLAGS_receiving_threads)); executor->RegisterContexts(rpcCompute, rpcResources, contexts_per_executor_thread); LOG(INFO) << "Running Server"; server.Run(std::chrono::milliseconds(1), [rpcResources] { rpcResources->ProgressEngine(); }); return 0; }
================================================ FILE: examples/03_Batching/launch_batching.sh ================================================
#!/bin/bash -e # # Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # cleanup() { kill $(jobs -p) ||: } trap "cleanup" EXIT SIGINT SIGTERM sleep 1 echo "starting streaming services" /work/build/examples/03_Batching/streaming-service-echo.x & wait-for-it.sh localhost:50051 --timeout=0 -- echo "Streaming service is ready." echo "starting batching service" /work/build/examples/03_Batching/batching-service-echo.x & wait-for-it.sh localhost:50049 --timeout=0 -- echo "Batching service is ready." echo echo "Starting a shell keeping the services and load-balancer running..."
echo "Try python unary_client.py - exit shell to kill services" bash --rcfile <(echo "PS1='Batching Subshell: '") ================================================ FILE: examples/03_Batching/simple_batching_client.py ================================================ import grpc import simple_pb2 import simple_pb2_grpc def run(): with grpc.insecure_channel('localhost:50051') as channel: stub = simple_pb2_grpc.InferenceStub(channel) def requests(): messages = [simple_pb2.Input(batch_id=i) for i in range(10)] for msg in messages: print("Sending Stream batch_id={}".format(msg.batch_id)) yield msg responses = stub.BatchedCompute(requests()) for resp in responses: print("Received msg on stream with batch_id={}".format(resp.batch_id)) if __name__ == "__main__": run() ================================================ FILE: examples/03_Batching/simple_pb2.py ================================================ # Generated by the protocol buffer compiler. DO NOT EDIT! # source: simple.proto import sys _b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) from google.protobuf import descriptor as _descriptor from google.protobuf import message as _message from google.protobuf import reflection as _reflection from google.protobuf import symbol_database as _symbol_database from google.protobuf import descriptor_pb2 # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() DESCRIPTOR = _descriptor.FileDescriptor( name='simple.proto', package='simple', syntax='proto3', serialized_pb=_b('\n\x0csimple.proto\x12\x06simple\"\x19\n\x05Input\x12\x10\n\x08\x62\x61tch_id\x18\x01 \x01(\x04\"\x1a\n\x06Output\x12\x10\n\x08\x62\x61tch_id\x18\x01 \x01(\x04\x32n\n\tInference\x12*\n\x07\x43ompute\x12\r.simple.Input\x1a\x0e.simple.Output\"\x00\x12\x35\n\x0e\x42\x61tchedCompute\x12\r.simple.Input\x1a\x0e.simple.Output\"\x00(\x01\x30\x01\x62\x06proto3') ) _INPUT = _descriptor.Descriptor( name='Input', full_name='simple.Input', filename=None, file=DESCRIPTOR, containing_type=None, fields=[ _descriptor.FieldDescriptor( name='batch_id', full_name='simple.Input.batch_id', index=0, number=1, type=4, cpp_type=4, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], serialized_start=24, serialized_end=49, ) _OUTPUT = _descriptor.Descriptor( name='Output', full_name='simple.Output', filename=None, file=DESCRIPTOR, containing_type=None, fields=[ _descriptor.FieldDescriptor( name='batch_id', full_name='simple.Output.batch_id', index=0, number=1, type=4, cpp_type=4, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, options=None, file=DESCRIPTOR), ], extensions=[ ], nested_types=[], enum_types=[ ], options=None, is_extendable=False, syntax='proto3', extension_ranges=[], oneofs=[ ], serialized_start=51, serialized_end=77, ) DESCRIPTOR.message_types_by_name['Input'] = _INPUT DESCRIPTOR.message_types_by_name['Output'] = _OUTPUT _sym_db.RegisterFileDescriptor(DESCRIPTOR) Input = _reflection.GeneratedProtocolMessageType('Input', (_message.Message,), dict( DESCRIPTOR = _INPUT, __module__ = 'simple_pb2' # @@protoc_insertion_point(class_scope:simple.Input) )) _sym_db.RegisterMessage(Input) Output = _reflection.GeneratedProtocolMessageType('Output', 
(_message.Message,), dict( DESCRIPTOR = _OUTPUT, __module__ = 'simple_pb2' # @@protoc_insertion_point(class_scope:simple.Output) )) _sym_db.RegisterMessage(Output) _INFERENCE = _descriptor.ServiceDescriptor( name='Inference', full_name='simple.Inference', file=DESCRIPTOR, index=0, options=None, serialized_start=79, serialized_end=189, methods=[ _descriptor.MethodDescriptor( name='Compute', full_name='simple.Inference.Compute', index=0, containing_service=None, input_type=_INPUT, output_type=_OUTPUT, options=None, ), _descriptor.MethodDescriptor( name='BatchedCompute', full_name='simple.Inference.BatchedCompute', index=1, containing_service=None, input_type=_INPUT, output_type=_OUTPUT, options=None, ), ]) _sym_db.RegisterServiceDescriptor(_INFERENCE) DESCRIPTOR.services_by_name['Inference'] = _INFERENCE # @@protoc_insertion_point(module_scope) ================================================ FILE: examples/03_Batching/simple_pb2_grpc.py ================================================ # Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! import grpc import simple_pb2 as simple__pb2 class InferenceStub(object): # missing associated documentation comment in .proto file pass def __init__(self, channel): """Constructor. Args: channel: A grpc.Channel. """ self.Compute = channel.unary_unary( '/simple.Inference/Compute', request_serializer=simple__pb2.Input.SerializeToString, response_deserializer=simple__pb2.Output.FromString, ) self.BatchedCompute = channel.stream_stream( '/simple.Inference/BatchedCompute', request_serializer=simple__pb2.Input.SerializeToString, response_deserializer=simple__pb2.Output.FromString, ) class InferenceServicer(object): # missing associated documentation comment in .proto file pass def Compute(self, request, context): # missing associated documentation comment in .proto file pass context.set_code(grpc.StatusCode.UNIMPLEMENTED) context.set_details('Method not implemented!') raise NotImplementedError('Method not implemented!') def BatchedCompute(self, request_iterator, context): # missing associated documentation comment in .proto file pass context.set_code(grpc.StatusCode.UNIMPLEMENTED) context.set_details('Method not implemented!') raise NotImplementedError('Method not implemented!') def add_InferenceServicer_to_server(servicer, server): rpc_method_handlers = { 'Compute': grpc.unary_unary_rpc_method_handler( servicer.Compute, request_deserializer=simple__pb2.Input.FromString, response_serializer=simple__pb2.Output.SerializeToString, ), 'BatchedCompute': grpc.stream_stream_rpc_method_handler( servicer.BatchedCompute, request_deserializer=simple__pb2.Input.FromString, response_serializer=simple__pb2.Output.SerializeToString, ), } generic_handler = grpc.method_handlers_generic_handler( 'simple.Inference', rpc_method_handlers) server.add_generic_rpc_handlers((generic_handler,)) ================================================ FILE: examples/03_Batching/streaming-service.cc ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
 * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <chrono> #include <vector> #include <gflags/gflags.h> #include <glog/logging.h> #include "nvrpc/context.h" #include "nvrpc/executor.h" #include "nvrpc/rpc.h" #include "nvrpc/server.h" #include "nvrpc/service.h" #include "tensorrt/laboratory/core/resources.h" #include "tensorrt/laboratory/core/thread_pool.h" using nvrpc::AsyncRPC; using nvrpc::AsyncService; using nvrpc::BatchingContext; using nvrpc::Executor; using nvrpc::Server; using trtlab::Resources; using trtlab::ThreadPool; #include "echo.grpc.pb.h" #include "echo.pb.h" class SimpleContext final : public BatchingContext<simple::Input, simple::Output, Resources> { void ExecuteRPC(std::vector<simple::Input>& inputs, std::vector<simple::Output>& outputs) final override { for(auto input = inputs.cbegin(); input != inputs.cend(); input++) { auto output = outputs.emplace(outputs.end()); output->set_batch_id(input->batch_id()); LOG(INFO) << "Response with batch_id=" << output->batch_id(); } this->FinishResponse(); } void OnRequestReceived(const RequestType& request) final override { LOG(INFO) << "Received request with batch_id=" << request.batch_id(); } }; int main(int argc, char* argv[]) { FLAGS_alsologtostderr = 1; // Log to console ::google::InitGoogleLogging("simpleServer"); ::google::ParseCommandLineFlags(&argc, &argv, true); Server server("0.0.0.0:50051"); LOG(INFO) << "Register Service (simple::Inference)"; auto simpleInference = server.RegisterAsyncService<simple::Inference>(); LOG(INFO) << "Register RPC (simple::Inference::BatchedCompute) with Service (simple::Inference)"; auto rpcCompute = simpleInference->RegisterRPC<SimpleContext>( &simple::Inference::AsyncService::RequestBatchedCompute); LOG(INFO) << "Initializing Resources for RPC (simple::Inference::BatchedCompute)"; auto rpcResources = std::make_shared<Resources>(); LOG(INFO) << "Creating Executor"; auto executor = server.RegisterExecutor(new Executor(1)); LOG(INFO) << "Creating Execution Contexts for RPC (simple::Inference::BatchedCompute) with Executor"; executor->RegisterContexts(rpcCompute, rpcResources, 10); LOG(INFO) << "Running Server"; server.Run(std::chrono::milliseconds(2000), [] { // This is a timeout loop executed every 2 seconds. // Run() with no arguments will run an empty timeout loop every 5 seconds. // RunAsync() will return immediately; it's your responsibility to ensure the // server doesn't go out of scope or a Shutdown will be triggered on your services.
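        // A hypothetical sketch of the RunAsync variant described above (method
        // names taken from this comment, not verified against the nvrpc headers):
        //   server.RunAsync();   // returns immediately
        //   wait_for_sigterm();  // e.g. block the main thread on a signal
        //   // keep 'server' in scope while serving; letting it go out of scope
        //   // triggers the Shutdown described above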
}); } ================================================ FILE: examples/03_Batching/unary_client.py ================================================ import grpc import simple_pb2 import simple_pb2_grpc def run(): with grpc.insecure_channel('localhost:50049') as channel: stub = simple_pb2_grpc.InferenceStub(channel) response = stub.Compute(simple_pb2.Input(batch_id=78)) print("Received msg with batch_id={}".format(response.batch_id)) if __name__ == "__main__": run() ================================================ FILE: examples/04_Middleman/CMakeLists.txt ================================================ # Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add_executable(middleman-client.x middleman-client.cc ) target_link_libraries(middleman-client.x trtlab::nvrpc nv-inference-protos gflags ) ================================================ FILE: examples/04_Middleman/middleman-client.cc ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <chrono> #include <map> #include <memory> #include <vector> #include <gflags/gflags.h> #include <glog/logging.h> #include "tensorrt/laboratory/core/memory/allocator.h" #include "tensorrt/laboratory/core/memory/malloc.h" #include "tensorrt/laboratory/core/thread_pool.h" using trtlab::Allocator; using trtlab::Malloc; #include "nvrpc/context.h" #include "nvrpc/executor.h" #include "nvrpc/server.h" using nvrpc::Context; using nvrpc::Executor; using nvrpc::Server; using trtlab::ThreadPool; #include "moodycamel/blockingconcurrentqueue.h" using moodycamel::BlockingConcurrentQueue; using moodycamel::ConsumerToken; using moodycamel::ProducerToken; // NVIDIA Inference Server Protos #include "nvidia_inference.grpc.pb.h" #include "nvidia_inference.pb.h" namespace easter = ::nvidia::inferenceserver; /* using nvidia::inferenceserver::GRPCService; using nvidia::inferenceserver::InferRequest; using nvidia::inferenceserver::InferResponse; */ /** * @brief Batching Service for Unary Requests * * Exposes a Unary (send/recv) interface for a given RPC, but rather than * computing the RPC, the service simply batches the incoming requests and * forwards them via a gRPC stream to a service that implements the actual * compute portion of the RPC. * * The backend compute service is not a Unary service. Rather, it must * implement the LifeCycleBatching service Context, i.e. BatchingContext. * The other application in this folder implements the backend service. * * Streams are used as a forwarding mechanism because of how they interact * with a load-balancer. Unlike unary requests which get balanced on each * request, a stream only gets balanced when it is opened. All items of a stream * go to the same endpoint service.
 * * @tparam ServiceType * @tparam Request * @tparam Response */ template<typename ServiceType, typename Request, typename Response> struct MiddlemanService { using Callback = std::function<void(bool)>; struct MessageType { Request* request; Response* response; Callback callback; }; /** * @brief Forwards incoming Unary requests via a gRPC Stream to * a Batched Streaming Service that implements the actual RPC */ class Client { public: using PrepareFunc = std::function<std::unique_ptr<::grpc::ClientAsyncResponseReader<Response>>( ::grpc::ClientContext*, const Request&, ::grpc::CompletionQueue*)>; Client(PrepareFunc prepare_func, std::shared_ptr<ThreadPool> thread_pool) : m_PrepareFunc(prepare_func), m_ThreadPool(thread_pool), m_CurrentCQ(0) { for(decltype(m_ThreadPool->Size()) i = 0; i < m_ThreadPool->Size(); i++) { LOG(INFO) << "Starting Client Progress Engine #" << i; m_CQs.emplace_back(new ::grpc::CompletionQueue); auto cq = m_CQs.back().get(); m_ThreadPool->enqueue([this, cq] { ProgressEngine(*cq); }); } } void WriteAndCloseStream(uint32_t messages_count, MessageType* messages) { auto cq = m_CQs[++m_CurrentCQ % m_CQs.size()].get(); DLOG(INFO) << "Client using CQ: " << (void*)cq; CHECK_EQ(1U, messages_count) << "forwarder; not batcher"; auto ctx = new Call; for(uint32_t i = 0; i < messages_count; i++) { ctx->Push(messages[i]); } ctx->m_Reader = m_PrepareFunc(&ctx->m_Context, *ctx->m_Request, cq); ctx->m_Reader->StartCall(); ctx->m_Reader->Finish(ctx->m_Response, &ctx->m_Status, ctx->Tag()); } private: class Call { public: Call() : m_NextState(&Call::StateFinishedDone) {} virtual ~Call() {} void Push(MessageType& message) { m_Request = message.request; m_Response = message.response; m_Callback = message.callback; } private: bool RunNextState(bool ok) { bool ret = (this->*m_NextState)(ok); if(!ret) DLOG(INFO) << "RunNextState returning false"; return ret; } void* Tag() { return static_cast<void*>(this); } bool Fail() { LOG(FATAL) << "Fail"; return false; } bool StateFinishedDone(bool ok) { if(m_Status.ok()) DLOG(INFO) << "ClientContext: " << Tag() << " finished with OK"; else DLOG(INFO) << "ClientContext: " << Tag() << " finished with CANCELLED"; m_Callback(m_Status.ok()); DLOG(INFO) << "Forwarding Completed for Tag " << Tag(); return false; } private: Request* m_Request; Response* m_Response; Callback m_Callback; bool (Call::*m_NextState)(bool); ::grpc::Status m_Status; ::grpc::ClientContext m_Context; std::unique_ptr<::grpc::ClientAsyncResponseReader<Response>> m_Reader; friend class Client; }; void ProgressEngine(::grpc::CompletionQueue& cq) { void* tag; bool ok = false; while(cq.Next(&tag, &ok)) { CHECK(ok) << "not ok"; Call* call = static_cast<Call*>(tag); if(!call->RunNextState(ok)) { DLOG(INFO) << "Deleting Stream: " << tag; delete call; } } } int m_CurrentCQ; PrepareFunc m_PrepareFunc; std::shared_ptr<ThreadPool> m_ThreadPool; std::vector<std::unique_ptr<::grpc::CompletionQueue>> m_CQs; }; public: class Resources : public ::trtlab::Resources { public: Resources(uint32_t max_batch_size, uint64_t timeout, std::shared_ptr<Client> client) : m_MaxBatchsize(max_batch_size), m_Timeout(timeout), m_Client(client) { } virtual void PreprocessRequest(Request* req) {} void Push(Request* req, Response* resp, Callback callback) { // thread_local ProducerToken token(m_MessageQueue); // m_MessageQueue.enqueue(token, MessageType(req, resp, callback)); PreprocessRequest(req); m_MessageQueue.enqueue(MessageType{req, resp, callback}); } void ProgressEngine() { constexpr uint64_t quanta = 100; const double timeout = static_cast<double>(m_Timeout - quanta) / 1000000.0; size_t total_count; size_t max_batch; thread_local ConsumerToken token(m_MessageQueue); for(;;) { std::vector<MessageType> messages(m_MaxBatchsize); max_batch = m_MaxBatchsize;
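// Batching window (same pattern as the batcher in examples/03_Batching): each
// wait_dequeue_bulk_timed call below blocks for at most 'quanta' (100us), so
// elapsed() is re-checked frequently. The do/while exits immediately if the
// first pull returns nothing (the outer loop simply restarts); otherwise it
// keeps collecting until the batch is full or the timeout window expires,
// and whatever was collected is then forwarded.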
total_count = 0; auto start = std::chrono::steady_clock::now(); auto elapsed = [start]() -> double { return std::chrono::duration<double>(std::chrono::steady_clock::now() - start) .count(); }; do { auto count = m_MessageQueue.wait_dequeue_bulk_timed( token, &messages[total_count], max_batch, quanta); CHECK_LE(count, max_batch); total_count += count; max_batch -= count; } while(total_count && total_count < m_MaxBatchsize && elapsed() < timeout); if(total_count) { m_Client->WriteAndCloseStream(total_count, messages.data()); } } } private: size_t m_MaxBatchsize; uint64_t m_Timeout; std::shared_ptr<Client> m_Client; BlockingConcurrentQueue<MessageType> m_MessageQueue; }; class ReceiveContext final : public ::nvrpc::Context<Request, Response, Resources> { void ExecuteRPC(Request& request, Response& response) final override { DLOG(INFO) << "incoming unary request"; this->GetResources()->Push(&request, &response, [this](bool ok) { if(ok) this->FinishResponse(); else { LOG(INFO) << "shoot"; this->CancelResponse(); } }); } }; }; DEFINE_uint32(max_batch_size, 1, "Maximum batch size to collect and forward"); DEFINE_uint64(timeout_usecs, 200, "Batching window timeout in microseconds"); DEFINE_uint32(max_batches_in_flight, 300, "Maximum number of forwarded batches"); DEFINE_uint32(receiving_threads, 2, "Number of Receiving threads"); DEFINE_uint32(forwarding_threads, 2, "Number of Forwarding threads"); DEFINE_string(forwarding_target, "localhost:8001", "Batched Compute Service / Load-Balancer"); using InferMiddlemanService = MiddlemanService<easter::GRPCService, easter::InferRequest, easter::InferResponse>; using StatusMiddlemanService = MiddlemanService<easter::GRPCService, easter::StatusRequest, easter::StatusResponse>; class DemoMiddlemanService : public InferMiddlemanService { public: class Resources : public InferMiddlemanService::Resources { public: using InferMiddlemanService::Resources::Resources; void PreprocessRequest(easter::InferRequest* req) override { static auto local_data = std::make_unique<Allocator<Malloc>>(10 * 1024 * 1024); DLOG(INFO) << "Boom - preprocess request here!"; auto bytes = req->meta_data().batch_size() * req->meta_data().input(0).byte_size(); CHECK_EQ(0, req->raw_input_size()); req->add_raw_input(local_data->Data(), bytes); } }; }; int main(int argc, char* argv[]) { FLAGS_alsologtostderr = 1; // Log to console ::google::InitGoogleLogging("easterForwardingService"); ::google::ParseCommandLineFlags(&argc, &argv, true); grpc::ChannelArguments ch_args; ch_args.SetMaxReceiveMessageSize(-1); auto channel = grpc::CreateCustomChannel(FLAGS_forwarding_target, grpc::InsecureChannelCredentials(), ch_args); // GRPCService::Infer async forwarder auto forwarding_threads = std::make_shared<ThreadPool>(FLAGS_forwarding_threads); auto stub = ::easter::GRPCService::NewStub(channel); auto forwarding_prepare_func = [&stub](::grpc::ClientContext * context, const ::easter::InferRequest& request, ::grpc::CompletionQueue* cq) -> auto { return std::move(stub->PrepareAsyncInfer(context, request, cq)); }; auto client = std::make_shared<InferMiddlemanService::Client>(forwarding_prepare_func, forwarding_threads); // GRPCService::Status async forwarder auto status_forwarding_threads = std::make_shared<ThreadPool>(1); auto status_stub = ::easter::GRPCService::NewStub(channel); auto status_forwarding_prepare_func = [&status_stub](::grpc::ClientContext * context, const ::easter::StatusRequest& request, ::grpc::CompletionQueue* cq) -> auto { return std::move(status_stub->PrepareAsyncStatus(context, request, cq)); }; auto status_client = std::make_shared<StatusMiddlemanService::Client>( status_forwarding_prepare_func, status_forwarding_threads); auto rpcResources = std::make_shared<DemoMiddlemanService::Resources>( FLAGS_max_batch_size, FLAGS_timeout_usecs, client); auto statusResources = std::make_shared<StatusMiddlemanService::Resources>( FLAGS_max_batch_size, FLAGS_timeout_usecs,
status_client); Server server("0.0.0.0:50049"); auto bytes = trtlab::StringToBytes("100MiB"); server.Builder().SetMaxReceiveMessageSize(bytes); LOG(INFO) << "gRPC MaxReceiveMessageSize = " << trtlab::BytesToString(bytes); auto recvService = server.RegisterAsyncService<::easter::GRPCService>(); auto rpcCompute = recvService->RegisterRPC<DemoMiddlemanService::ReceiveContext>( &::easter::GRPCService::AsyncService::RequestInfer); auto rpcStatus = recvService->RegisterRPC<StatusMiddlemanService::ReceiveContext>( &::easter::GRPCService::AsyncService::RequestStatus); uint64_t context_count = FLAGS_max_batch_size * FLAGS_max_batches_in_flight; uint64_t contexts_per_executor_thread = std::max(context_count / FLAGS_receiving_threads, 1UL); auto executor = server.RegisterExecutor(new Executor(FLAGS_receiving_threads)); executor->RegisterContexts(rpcCompute, rpcResources, contexts_per_executor_thread); auto status_executor = server.RegisterExecutor(new Executor(1)); status_executor->RegisterContexts(rpcStatus, statusResources, 1); auto executor_threads = std::make_shared<ThreadPool>(2); executor_threads->enqueue([rpcResources] { rpcResources->ProgressEngine(); }); executor_threads->enqueue([statusResources] { statusResources->ProgressEngine(); }); LOG(INFO) << "Running Server"; server.Run(std::chrono::milliseconds(1), [] {}); }
================================================ FILE: examples/10_Internals/CMakeLists.txt ================================================
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add_executable(internals.x internals.cc ) target_link_libraries(internals.x trtlab::core trtlab::cuda gflags )
================================================ FILE: examples/10_Internals/README.md ================================================
# YAIS Internals The `internals.x` program is designed to be run on a DGX-Station or DGX-1. This is mostly to highlight the use of the Affinity API. If you want to run on a different CPU architecture, you simply need to change the following lines to a range that works with your CPU.
``` // Socket 1 - non-hyperthreads on a DGX-1, or // Socket 0 - hyperthreads on a DGX-Station auto socket_1 = Affinity::GetAffinity().Intersection( Affinity::GetCpusFromString("20-39") // <== Change Me! ); ``` ## Primitive Classes * `Affinity` * Get and Set CPU affinities for the current thread * `ThreadPool` * Create a generic worker thread pool that accepts arbitrary lambda functions. * Pinned threads from `ThreadPool`s are used to allocate memory, to ensure that CPU allocations are allocated and first-touched on threads on the NUMA node where they will be used. This is important for keeping each NUMA node's threads and memory pools separate. * `Memory` * `Memory` and the derived classes (`MallocMemory`, `CudaPinnedHostMemory`, `CudaDeviceMemory`) are not used directly; however, they provide the implementation details used by the generic `Allocator`. * `Allocator` * Generic Templated Class used to create `std::shared_ptr`s and `std::unique_ptr`s to instances of `Allocator<MemoryType>`. * `MemoryStack` * Generic Templated Class to create a memory stack from a given `AllocatorType`. * You can only advance the stack pointer, or reset the entire stack. * TODO: Create sub-stacks from a given stack. * `Pool` * Generic Templated Class that holds objects of `ResourceType`. * Resources can be checked out of the Pool (`Pop`) as a *special type* of `std::shared_ptr`, which automatically returns the Resource object to the pool when the reference count of the `shared_ptr` goes to zero. This ensures Resources are not lost on exceptions, and also that the Pool cannot be deleted until all objects have been returned to the Pool. ## TensorRT Classes * `Model` * Wrapper around `nvinfer1::ICudaEngine` * `Buffers` * `MemoryStackWithTracking<CudaPinnedHostMemory>` and `MemoryStackWithTracking<CudaDeviceMemory>` used to manage Input/Output Tensor Bindings. * Owns a `cudaStream_t` to be used with Async Copies and Kernel Executions on the data held by the Buffers. * Convenience H2D and D2H copy functions * `ExecutionContext` - Wrapper around `nvinfer1::IExecutionContext` * `Enqueue` launches the inference calculation and adds a `cudaEvent_t` to the stream to be triggered when the inference calculation is finished and the `ExecutionContext` can be released. * `Resources` * Combines the above set of resources into a single `trtlab::Resources` class capable of being associated with a `nvrpc::Context`. ## Examples ### Affinity * [Definition: tensorrt/laboratory/core/affinity.h](../../yais/include/tensorrt/laboratory/core/affinity.h) * [Implementation: YAIS/Affinity.cc](../../yais/src/Affinity.cc) Here, we request all the logical CPUs from Socket 0 that are not hyperthreads; then we get either all the non-hyperthreads from socket 1 on a DGX-1, or the hyperthreads on socket 0 on a DGX-Station, using `GetCpusFromString`. ``` // Socket 0 - non-hyperthreads on a DGX-1 or Station auto socket_0 = Affinity::GetAffinity().Intersection( Affinity::GetCpusBySocket(0).Intersection( Affinity::GetCpusByProcessingUnit(0) )); // Socket 1 - non-hyperthreads on a DGX-1, or // Socket 0 - hyperthreads on a DGX-Station auto socket_1 = Affinity::GetAffinity().Intersection( Affinity::GetCpusFromString("20-39") ); LOG(INFO) << socket_0; ``` Single-line output reformatted to per-line-indented output for readability. ``` 0515 07:14:48.007148 10919 test_affinity.cc:61] [id: 0, numa: 0, socket: 0, core: 0, processing_unit: 0], [id: 1, numa: 0, socket: 0, core: 1, processing_unit: 0], [id: 2, numa: 0, socket: 0, core: 2, processing_unit: 0], ... omitted for brevity ...
[id: 18, numa: 0, socket: 0, core: 18, processing_unit: 0], [id: 19, numa: 0, socket: 0, core: 19, processing_unit: 0] ``` ### ThreadPool * [Definition: tensorrt/laboratory/core/thread_pool.h](../../yais/include/tensorrt/laboratory/core/thread_pool.h) * [Implementation: YAIS/ThreadPool.cc](../../yais/src/ThreadPool.cc) The ThreadPool class creates a pool of worker threads that pull work from a queue. The work queue can be any set of captured lambda functions or function pointers passed to the `enqueue` function. ``` // Create a ThreadPool where each thread is pinned to one logical CPU in the CpuSet auto workers_0 = std::make_shared<ThreadPool>(socket_0); auto workers_1 = std::make_shared<ThreadPool>(socket_1); // Create a massive set of threads that can run anywhere our current process is allowed to run auto bftp = std::make_unique<ThreadPool>(128, Affinity::GetAffinity()); // Shutdown the BFTP bftp.reset(); // Enqueue some basic logging for(int i=0; i<10; i++) { auto result = workers_0->enqueue([i]{ LOG(INFO) << i << " " << Affinity::GetAffinity(); std::this_thread::sleep_for(std::chrono::milliseconds(10)); }); } ``` As these ThreadPools are generic, we can enqueue any type of work to them. Many thanks to the original authors Jakob Progsch and Václav Zeman for this incredibly useful class. For details on the original work and the modifications made in this project, see [CREDITS.md](../../CREDITS.md) and the source code. ### Memory One of the reasons why `Affinity` and `ThreadPool` were introduced prior to `Memory` is that memory on NUMA systems can be difficult to handle correctly. For memory segments that will be primarily used by sets of threads, it is very important to first set the affinity of the threads, then allocate and touch each page in the memory allocation (first-touch) on the thread that will primarily use the segment. NERSC has a nice [write-up on memory affinity and first touch policies](http://www.nersc.gov/users/computational-systems/cori/application-porting-and-performance/improving-openmp-scaling/). In this section, we'll show how to properly use the `Memory` and `Allocator` classes in a NUMA-friendly way using `ThreadPool`s. * [Definition: tensorrt/laboratory/core/memory.h](../../yais/include/tensorrt/laboratory/core/memory.h) The `Memory` class and its derived classes, see below, are the core memory classes in YAIS; however, these classes are not directly used. Instead, they provide the implementation details on how memory of their respective classes is to be allocated, freed, and page-aligned. For details, see the comments in the source code. Derived `Memory` Classes: * `Malloc` * `CudaPinnedHostMemory` * `CudaDeviceMemory` * `CudaManagedMemory` ### Allocator * [Definition: tensorrt/laboratory/core/memory.h](../../yais/include/tensorrt/laboratory/core/memory.h) The templated `Allocator` class performs memory allocation and freeing operations. This class does not have a public constructor; instead, you are required to use either the `make_shared` or `make_unique` static methods. In doing so, the method to free the allocation is captured by the destructor, which is triggered by the default deleter of the `shared_ptr` or `unique_ptr`. An allocated memory segment is of type `Allocator<MemoryType>`, which inherits from `MemoryType`. The base `Memory` class provides three functions: `GetPointer()`, `GetSize()`, and `WriteZeros()`.
``` std::shared_ptr<CudaPinnedHostMemory> pinned_0, pinned_1; auto future_0 = workers_0->enqueue([&pinned_0]{ pinned_0 = Allocator<CudaPinnedHostMemory>::make_shared(1024*1024*1024); pinned_0->WriteZeros(); }); auto future_1 = workers_1->enqueue([&pinned_1]{ pinned_1 = Allocator<CudaPinnedHostMemory>::make_shared(1024*1024*1024); pinned_1->WriteZeros(); }); future_0.get(); CHECK(pinned_0) << "pinned_0 got deallocated - fail"; LOG(INFO) << "pinned_0 (ptr, size): (" << pinned_0->GetPointer() << ", " << pinned_0->GetSize() << ")"; ``` ``` I0515 08:36:56.619297 13260 test_affinity.cc:59] pinned_0 (ptr, size): (0x1005e000000, 1073741824) ``` ### MemoryStack * [Definition: tensorrt/laboratory/core/memory_stack.h](../../yais/include/tensorrt/laboratory/core/memory_stack.h) Generic `MemoryStack` that takes an `AllocatorType`. The memory stack advances the stack pointer via `Allocate` and resets the stack pointer via `ResetAllocations`. `MemoryStackWithTracking` is a specialized derivation that records the pointer and size of each call to `Allocate`. `MemoryStackWithTracking` is used in the provided TensorRT classes as a means to push the input/output tensor bindings onto the stack. ``` std::shared_ptr<MemoryStackWithTracking<CudaDeviceMemory>> gpu_stack_on_socket0; future_0 = workers_0->enqueue([&gpu_stack_on_socket0]{ CHECK_EQ(cudaSetDevice(0), CUDA_SUCCESS) << "Set Device 0 failed"; gpu_stack_on_socket0 = std::make_shared< MemoryStackWithTracking<CudaDeviceMemory>>(1024*1024*1024); }); future_0.get(); // thread allocating gpu_stack_on_socket0 finished with task LOG(INFO) << "Push Binding 0 - 10MB - stack_ptr = " << gpu_stack_on_socket0->Allocate(10*1024*1024); LOG(INFO) << "Push Binding 1 - 128MB - stack_ptr = " << gpu_stack_on_socket0->Allocate(128*1024*1024); gpu_stack_on_socket0->ResetAllocations(); ``` ``` I0515 09:46:55.159700 14176 test_affinity.cc:78] Push Binding 0 - 10MB - stack_ptr = 0x1009e000000 I0515 09:46:55.159710 14176 test_affinity.cc:80] Push Binding 1 - 128MB - stack_ptr = 0x1009ea00000 ``` ### Pool * [Definition: tensorrt/laboratory/core/pool.h](../../yais/include/tensorrt/laboratory/core/pool.h) A `Pool` is a generic `Queue<std::shared_ptr<ResourceType>>` with a special `Pop` method. The class inherits from `std::enable_shared_from_this`, meaning it must be constructed using the factory method, which ensures the object is owned by a `std::shared_ptr`. The `Pop` method of `Pool` is probably the coolest and most contentious component of this library. `Pop` pulls a resource (`from_pool`) off the queue; however, it does not return this resource directly. Instead, a *new type* of `std::shared_ptr` is created using the raw pointer from `from_pool`. The reason this is a *new type* of `shared_ptr` is that we provide a custom `Deleter` which captures by value (incrementing the reference count) both `from_pool` and a `shared_ptr` to the pool itself. The custom `Deleter` does not free the resource when its reference count goes to zero; rather, it returns the original `from_pool` `shared_ptr` to the pool. By capturing a `shared_ptr` to the pool in the `Deleter`, we ensure that the pool cannot be freed while resources are checked out. This also ensures that the `shared_ptr` returned from `Pop` is exception safe; meaning, the resource will be returned to the pool if an exception is thrown and caught - it won't leak resources. Alternatively, `Pop` can be called with an `onReturn` lambda function, which will be executed just prior to the original object being returned to the Pool. If the `ResourceType` is stateful, this is a good chance to clear the state and prepare it for the next use.
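For illustration, here is a minimal sketch of the `onReturn` form, applied to the `buffers` pool constructed in the example below (the exact `Pop` signature here is an assumption, not taken from [pool.h](../../yais/include/tensorrt/laboratory/core/pool.h)):

```
// Hypothetical: clear per-use state just before the Buffer re-enters the pool
auto buffer = buffers->Pop([](Buffer* buffer) {
    buffer->gpu_stack->ResetAllocations(); // uses the MemoryStackWithTracking API above
});
```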
``` struct Buffer { Buffer( std::shared_ptr<CudaPinnedHostMemory> pinned_, std::shared_ptr<MemoryStackWithTracking<CudaDeviceMemory>> gpu_stack_, std::shared_ptr<ThreadPool> workers_ ) : pinned(pinned_), gpu_stack(gpu_stack_), workers(workers_) {} // a real example probably includes a deviceID and a stream as part of the buffer std::shared_ptr<CudaPinnedHostMemory> pinned; std::shared_ptr<MemoryStackWithTracking<CudaDeviceMemory>> gpu_stack; std::shared_ptr<ThreadPool> workers; }; auto buffers = Pool<Buffer>::Create(); buffers->EmplacePush(new Buffer(pinned_0, gpu_stack_on_socket0, workers_0)); buffers->EmplacePush(new Buffer(pinned_1, gpu_stack_on_socket1, workers_1)); for(int i=0; i<6; i++) { auto buffer = buffers->Pop(); buffer->workers->enqueue([buffer]{ // perform some work - regardless of which buffer you got, you are working // on a thread properly associated with the resources // note: buffer is captured by value, incrementing its reference count, // meaning you have access to it here and when it goes out of scope, it will // be returned to the Pool. LOG(INFO) << Affinity::GetAffinity(); }); } ``` ## TensorRT Examples * [Definition: YAIS/YAIS/TensorRT/TensorRT.h](../../yais/include/YAIS/TensorRT/TensorRT.h) * [Implementation: YAIS/TensorRT.cc](../../yais/src/TensorRT.cc) TensorRT classes build on the primitives above. For now, see the comments in the header file, as the header file is pretty well documented.
================================================ FILE: examples/10_Internals/internals.cc ================================================
/* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include "tensorrt/laboratory/core/affinity.h" #include "tensorrt/laboratory/core/memory/allocator.h" #include "tensorrt/laboratory/core/memory/memory_stack.h" #include "tensorrt/laboratory/core/pool.h" #include "tensorrt/laboratory/core/thread_pool.h" #include "tensorrt/laboratory/cuda/device_info.h" #include "tensorrt/laboratory/cuda/memory/cuda_device.h" #include "tensorrt/laboratory/cuda/memory/cuda_managed.h" #include "tensorrt/laboratory/cuda/memory/cuda_pinned_host.h" #include #include #include #include #include #include #include #include using trtlab::Affinity; using trtlab::Allocator; using trtlab::CpuSet; using trtlab::CudaDeviceMemory; using trtlab::CudaPinnedHostMemory; using trtlab::DeviceInfo; using trtlab::MemoryStack; using trtlab::Pool; using trtlab::ThreadPool; int main(int argc, char* argv[]) { FLAGS_alsologtostderr = 1; // Log to console ::google::InitGoogleLogging("example10/internals.x"); auto one_gib = 1024 * 1024 * 1024; auto zeroMemory = true; const auto& gpu_0 = DeviceInfo::Affinity(0); // Socket 0 - non-hyperthreads on a DGX-1 or Station const auto& socket_0 = Affinity::GetAffinity().Intersection( Affinity::GetCpusBySocket(0).Intersection(Affinity::GetCpusByProcessingUnit(0))); // Socket 1 - non-hyperthreads on a DGX-1, or // Socket 0 - hyperthreads on a DGX-Station const auto& socket_1 = Affinity::GetAffinity().Intersection(Affinity::GetCpusFromString("20-39")); auto workers_0 = std::make_shared(socket_0); auto workers_1 = std::make_shared(socket_1); std::shared_ptr pinned_0, pinned_1; auto future_0 = workers_0->enqueue([=, &pinned_0] { pinned_0 = std::make_shared>(one_gib); pinned_0->Fill(0); }); auto future_1 = workers_1->enqueue([=, &pinned_1] { pinned_1 = std::make_shared>(one_gib); pinned_1->Fill(0); }); LOG(INFO) << socket_0; future_0.get(); CHECK(pinned_0) << "pinned_0 got deAllocator - fail"; LOG(INFO) << "pinned_0 (ptr, size): (" << pinned_0->Data() << ", " << pinned_0->Size() << ")"; future_1.get(); std::shared_ptr> gpu_stack_on_socket0; std::shared_ptr> gpu_stack_on_socket1; // It's not strictly necessary to alloaction GPU memory from threads near the GPU // this just drives home the point that we want to align CPU worker thread to GPU affinity. future_0 = workers_0->enqueue([=, &gpu_stack_on_socket0] { CHECK_EQ(cudaSetDevice(0), CUDA_SUCCESS) << "Set Device 0 failed"; gpu_stack_on_socket0 = std::make_shared>(one_gib); gpu_stack_on_socket0->Reset(zeroMemory); }); // On a dual-socket system, we could use workers_1 to allocation device memory. // Leaving this as an exercise to the reader. future_0.get(); // thread allocating gpu_stack_on_socket0 finished with task LOG(INFO) << "Push Binding 0 - 10MB - stack_ptr = " << gpu_stack_on_socket0->Allocate(10 * 1024 * 1024); LOG(INFO) << "Push Binding 1 - 128MB - stack_ptr = " << gpu_stack_on_socket0->Allocate(128 * 1024 * 1024); // Try allocating 1 byte. Notice how the memory is aligned. Default alignment // is defined by the MemoryType in Memory.h gpu_stack_on_socket0->Reset(); /** * Create a Buffer object associates a worker threads, host memory and device memory * that are properly aligned to the hardware topology. */ struct Buffer { Buffer(std::shared_ptr pinned_, std::shared_ptr> gpu_stack_, std::shared_ptr workers_) : pinned(pinned_), gpu_stack(gpu_stack_), workers(workers_) { } std::shared_ptr pinned; std::shared_ptr> gpu_stack; std::shared_ptr workers; // Normally, we'd associate some GPU index value to the buffer. 
}; // Now create a Pool of Buffers auto buffers = Pool<Buffer>::Create(); // Here we push two buffers, one for each socket. buffers->EmplacePush(new Buffer(pinned_0, gpu_stack_on_socket0, workers_0)); buffers->EmplacePush(new Buffer(pinned_1, gpu_stack_on_socket1, workers_1)); // Exercise: add more buffer objects. Which of the three objects per Buffer // will you reuse, which will you make new instances of? // If you have arbitrary work which is not necessarily topology-aligned, say an incoming // inference request, you can pull a buffer object from the pool and queue work to the // proper set of threads best associated with that device for(int i = 0; i < 6; i++) { auto buffer = buffers->Pop(); buffer->workers->enqueue([buffer] { // perform some work - regardless of which buffer you got, you are working // on a thread properly associated with the resources LOG(INFO) << Affinity::GetAffinity(); std::this_thread::sleep_for(std::chrono::milliseconds(1)); }); } workers_0.reset(); workers_1.reset(); return 0; }
================================================ FILE: examples/11_Protos/CMakeLists.txt ================================================
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. set(protobuf_MODULE_COMPATIBLE TRUE) find_package(Protobuf CONFIG REQUIRED) message(STATUS "Using protobuf ${protobuf_VERSION}") set(_PROTOBUF_LIBPROTOBUF protobuf::libprotobuf) set(_PROTOBUF_PROTOC $<TARGET_FILE:protobuf::protoc>) add_subdirectory(echo) add_subdirectory(demo) add_subdirectory(inference)
================================================ FILE: examples/11_Protos/demo/CMakeLists.txt ================================================
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. INCLUDE(GRPCGenerateCPP) PROTOBUF_GENERATE_CPP(PROTO_SRCS PROTO_HDRS dataset.proto inference.proto ) PROTOBUF_GENERATE_GRPC_CPP(PROTO_GRPC_SRCS PROTO_GRPC_HDRS dataset.proto inference.proto ) add_library(demo-protos ${PROTO_SRCS} ${PROTO_GRPC_SRCS} ) target_link_libraries(demo-protos PUBLIC ${_PROTOBUF_LIBPROTOBUF} ) target_include_directories(demo-protos PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ) ================================================ FILE: examples/11_Protos/demo/dataset.proto ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ syntax = "proto3"; service SharedMemoryDataSet { rpc GetInfo (InfoRequest) returns (Info) {} } message Image { fixed64 sysv_offset = 1; uint32 label_index = 2; repeated int32 shape = 3; uint64 size = 4; string filename = 5; } message Info { uint32 handle = 1; uint64 sysv_key = 2; repeated Image images = 3; repeated string labels = 4; } message InfoRequest { uint32 image_size = 1; } ================================================ FILE: examples/11_Protos/demo/inference.proto ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ syntax = "proto3"; package ssd; service Inference { rpc Compute (BatchInput) returns (BatchPredictions) {} } message BatchInput { uint32 engine_id = 1; uint64 batch_id = 2; uint32 batch_size = 3; uint32 int_offset = 4; uint64 sysv_offset = 5; bytes data = 6; } message BatchPredictions { repeated Element elements = 1; uint64 batch_id = 2; float compute_time = 3; float total_time = 4; repeated Timer timers = 5; } message Element { repeated Prediction predictions = 2; } message Bbox { float x = 1; // upper left float y = 2; // upper left float width = 3; float height = 4; uint32 class_id = 5; } message Prediction { uint32 class_id = 1; string class_str = 2; float score = 3; Bbox bbox = 4; } message Timer { string name = 1; float time = 2; enum TimerUnit { SECONDS = 0; MILLI = 1; MICRO = 2; NANO = 3; } TimerUnit unit = 3; } ================================================ FILE: examples/11_Protos/echo/CMakeLists.txt ================================================ # Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. 
# * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. INCLUDE(GRPCGenerateCPP) PROTOBUF_GENERATE_CPP(PROTO_SRCS PROTO_HDRS echo.proto ) PROTOBUF_GENERATE_GRPC_CPP(PROTO_GRPC_SRCS PROTO_GRPC_HDRS echo.proto ) add_library(echo-protos ${PROTO_SRCS} ${PROTO_GRPC_SRCS} ) target_link_libraries(echo-protos PUBLIC ${_PROTOBUF_LIBPROTOBUF} ) target_include_directories(echo-protos PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ) ================================================ FILE: examples/11_Protos/echo/echo.proto ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ syntax = "proto3"; package simple; service Inference { rpc Compute (Input) returns (Output) {} rpc Bidirectional (stream Input) returns (stream Output) {} rpc BatchedCompute (stream Input) returns (stream Output) {} } message SystemV { uint64 shm_id = 1; uint64 offset = 2; uint64 size = 3; } message Input { uint64 batch_id = 1; oneof data { bytes raw_bytes = 2; SystemV sysv = 3; } } message Output { uint64 batch_id = 1; } ================================================ FILE: examples/11_Protos/inference/CMakeLists.txt ================================================ # Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. INCLUDE(GRPCGenerateCPP) PROTOBUF_GENERATE_CPP(PROTO_SRCS PROTO_HDRS nvidia_inference.proto api.proto model_config.proto request_status.proto server_status.proto ) PROTOBUF_GENERATE_GRPC_CPP(PROTO_GRPC_SRCS PROTO_GRPC_HDRS nvidia_inference.proto ) add_library(nv-inference-protos ${PROTO_SRCS} ${PROTO_GRPC_SRCS} ) target_link_libraries(nv-inference-protos PUBLIC ${_PROTOBUF_LIBPROTOBUF} ) target_include_directories(nv-inference-protos PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ) ================================================ FILE: examples/11_Protos/inference/api.proto ================================================ // Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. 
================================================
FILE: examples/11_Protos/inference/CMakeLists.txt
================================================
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

INCLUDE(GRPCGenerateCPP)

PROTOBUF_GENERATE_CPP(PROTO_SRCS PROTO_HDRS
    nvidia_inference.proto
    api.proto
    model_config.proto
    request_status.proto
    server_status.proto
)

PROTOBUF_GENERATE_GRPC_CPP(PROTO_GRPC_SRCS PROTO_GRPC_HDRS
    nvidia_inference.proto
)

add_library(nv-inference-protos
  ${PROTO_SRCS}
  ${PROTO_GRPC_SRCS}
)

target_link_libraries(nv-inference-protos PUBLIC
  ${_PROTOBUF_LIBPROTOBUF}
)

target_include_directories(nv-inference-protos PUBLIC
  ${CMAKE_CURRENT_BINARY_DIR}
)

================================================
FILE: examples/11_Protos/inference/api.proto
================================================
// Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

syntax = "proto3";

package nvidia.inferenceserver;

// Request header for inferencing. The actual input data is delivered
// separate from the header.
message InferRequestHeader {
  // Input...
  message Input {
    // Name of the input.
    string name = 1;

    // Size of the input, in bytes. This is the size for one instance
    // of the input, not the entire size of a batch of the input.
    uint64 byte_size = 2;
  }

  // Output...
  message Output {
    // Name of the output.
    string name = 1;

    // Size of the output, in bytes. This is the size for one instance
    // of the output, not the entire size of a batch of the output.
    uint64 byte_size = 2;

    // Class result format. The output must be a vector. Output values
    // will be interpreted as probabilities and the highest 'count'
    // values will be returned.
    message Class {
      // Return the 'count' highest valued results.
      uint32 count = 1;
    }

    // Optional. If defined return this result as a classification
    // instead of raw data.
    Class cls = 3;
  }

  // Batch size of the inference inputs.
  uint32 batch_size = 1;

  // Inference inputs.
  repeated Input input = 2;

  // Inference outputs that are being requested.
  repeated Output output = 3;
}

// Response header for inferencing. Any raw response data (i.e. tensor
// values) is delivered separately from the header.
message InferResponseHeader {
  // Output...
  message Output {
    // Name of the output.
    string name = 1;

    // Raw result
    message Raw {
      // Size of the output, in bytes. This is the size for one
      // instance of the output, not the entire size of a batch of the
      // output.
      uint64 byte_size = 1;
    }

    // Classification result
    message Class {
      // The index in the output tensor.
      int32 idx = 1;

      // The value of the class as a float (typically a probability).
      float value = 2;

      // The label for the class (optional, only available if provided
      // by the model).
      string label = 3;
    }

    message Classes {
      // The topk classes for this output
      repeated Class cls = 1;
    }

    // Result format for this output. Only one of these may be
    // specified. For 'batch_classes' there should be one entry for
    // each output of the batch.
    Raw raw = 2;
    repeated Classes batch_classes = 3;
  }

  // Name of the model that produced the results.
  string model_name = 1;

  // Version of the model that produced the results.
  uint32 model_version = 2;

  // Batch size of the inference outputs.
  uint32 batch_size = 3;

  // The outputs
  repeated Output output = 4;
}

================================================
FILE: examples/11_Protos/inference/model_config.proto
================================================
// Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Copyright (c) 2018-2019, TensorFlow Authors. All rights reserved.

syntax = "proto3";

package nvidia.inferenceserver;

// Data types supported for input and output tensors.
enum DataType {
  TYPE_INVALID = 0;
  TYPE_BOOL = 1;
  TYPE_UINT8 = 2;
  TYPE_UINT16 = 3;
  TYPE_UINT32 = 4;
  TYPE_UINT64 = 5;
  TYPE_INT8 = 6;
  TYPE_INT16 = 7;
  TYPE_INT32 = 8;
  TYPE_INT64 = 9;
  TYPE_FP16 = 10;
  TYPE_FP32 = 11;
  TYPE_FP64 = 12;
}

// A group of one or more instances of a model and resources made
// available for those instances.
message ModelInstanceGroup {
  // Kind of this instance group.
  enum Kind {
    // This instance group represents instances that can run on either
    // CPU or GPU. If all GPUs listed in 'gpus' are available then
    // instances will be created on GPU(s), otherwise instances will be
    // created on CPU.
    KIND_AUTO = 0;

    // This instance group represents instances that must run on the
    // GPU.
    KIND_GPU = 1;

    // This instance group represents instances that must run on the
    // CPU.
    KIND_CPU = 2;
  }

  // Optional name of this group of instances. If not specified the
  // name will be formed as <model name>_<group number>. The name of
  // individual instances will be further formed by a unique instance
  // number and GPU index:
  //   <group name>_<instance number>_gpu<gpu index>
  string name = 1;

  // The kind of this instance group. Default is KIND_AUTO. If
  // KIND_AUTO or KIND_GPU then both 'count' and 'gpu' are valid and
  // may be specified. If KIND_CPU only 'count' is valid and 'gpu'
  // cannot be specified.
  Kind kind = 4;

  // Number of instances in this group created for each GPU listed in
  // 'gpus'. Default is 1.
  int32 count = 2;

  // GPU(s) where instances should be available. For each GPU listed,
  // 'count' instances of the model will be available. Setting 'gpus'
  // to empty (or not specifying at all) is equivalent to listing all
  // system GPUs.
  repeated int32 gpus = 3;
}

// Input tensor for the model
message ModelInput {
  // Format for the input.
  enum Format {
    // The input has no specific format.
    FORMAT_NONE = 0;

    // Image formats.
    // Tensors with this format require 3 dimensions if
    // the model does not support batching (max_batch_size = 0) or 4
    // dimensions if the model does support batching (max_batch_size
    // >= 1). In either case the 'dims' below should only specify the
    // 3 non-batch dimensions (i.e. HWC or CHW).
    FORMAT_NHWC = 1;
    FORMAT_NCHW = 2;
  }

  string name = 1;
  DataType data_type = 2;
  Format format = 3;
  repeated int64 dims = 4;
}

// Output tensor for the model
message ModelOutput {
  string name = 1;
  DataType data_type = 2;
  repeated int64 dims = 3;

  // Label file for this output (optional).
  string label_filename = 4;
}

// Policy indicating which versions of a model should be made
// available by the inference server.
message ModelVersionPolicy {
  // Serve only the 'num_versions' highest-numbered versions. This is
  // the default policy and the default value of 'num_versions' is 1,
  // indicating that by default only the highest-number version of a
  // model will be served.
  message Latest {
    uint32 num_versions = 1;
  }

  // Serve all versions of the model.
  message All {
  }

  // Serve only a specific set of versions of the model.
  message Specific {
    repeated int64 versions = 1;
  }

  // Each model must implement only a single policy. The default
  // policy is 'Latest'.
  oneof policy_choice {
    Latest latest = 1;
    All all = 2;
    Specific specific = 3;
  }
}

// Model configuration.
message ModelConfig {
  // Name of the model.
  string name = 1;

  // Type of model (e.g. "tensorflow").
  string platform = 2;

  // Policy indicating which version(s) of the model will be served.
  ModelVersionPolicy version_policy = 3;

  // Maximum batch size allowed for inference. This can only decrease
  // what is allowed by the model itself. A value of 0 indicates that
  // batching is not-allowed/is-disabled (for some input formats this
  // has implications on the expected dimension of the inputs, see
  // Format above).
  int32 max_batch_size = 4;

  // Inputs and outputs to the model.
  repeated ModelInput input = 5;
  repeated ModelOutput output = 6;

  // Optional instances of this model. If not specified, one instance
  // of the model will be instantiated on each available GPU.
  repeated ModelInstanceGroup instance_group = 7;

  // Optional filename of the model file to use if a
  // compute-capability specific model is not specified in
  // 'cc_model_names'. If not specified the default is model.graphdef
  // for TF graphdef models and model.plan for TensorRT PLAN models.
  string default_model_filename = 8;

  // Optional map from CUDA compute capabilities to the filename of
  // the model that supports that compute capability. The filename
  // refers to a file within the model version directory.
  map<string, string> cc_model_filenames = 9;
}

// List of model configurations.
message ModelConfigList {
  repeated ModelConfig config = 1;
}

================================================
FILE: examples/11_Protos/inference/nvidia_inference.proto
================================================
// Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

syntax = "proto3";

package nvidia.inferenceserver;

import "api.proto";
import "request_status.proto";
import "server_status.proto";

service GRPCService {
  // Get server or model status
  rpc Status(StatusRequest) returns (StatusResponse) {}

  // Control server profiling
  rpc Profile(ProfileRequest) returns (ProfileResponse) {}

  // Health check
  rpc Health(HealthRequest) returns (HealthResponse) {}

  // Perform inference. [ Set the maximum message size (default 4 MB)
  // and transmit in one pass. TensorFlow seems to use this approach to
  // transfer tensors, which can be large:
  // https://github.com/grpc/grpc/issues/8975 ]
  rpc Infer(InferRequest) returns (InferResponse) {}
}

// Request message for server status.
message StatusRequest {
  // The specific model status to be returned. Return status for all
  // models if empty.
  string model_name = 1;
}

// Response message for server status.
message StatusResponse {
  RequestStatus request_status = 1;
  ServerStatus server_status = 2;
}

// Request message for profile.
message ProfileRequest {
  string cmd = 1;
}

// Response message for profile.
message ProfileResponse {
  RequestStatus request_status = 1;
}

// Request message for health.
message HealthRequest {
  string mode = 1;
}

// Response message for health.
message HealthResponse {
  RequestStatus request_status = 1;
  bool health = 2;
}

// Request message for inference.
message InferRequest {
  // Name of model to use for inference
  string model_name = 1;

  // Version of the model to use for inference. If not specified use
  // the latest/most-recent version of the model. [ Use string here so
  // default value of empty indicates not specified ].
  string version = 2;

  // Meta-data for the inference request.
  InferRequestHeader meta_data = 3;

  // Raw input tensor data in the order specified in 'meta_data'.
  repeated bytes raw_input = 4;

  uint64 batch_id = 100;
  uint32 batch_size = 101;
  uint64 sysv_offset = 102;
}

// Response message for inference.
message InferResponse {
  RequestStatus request_status = 1;

  // Meta-data for the inference response.
  InferResponseHeader meta_data = 2;

  // Raw output tensor data in the order specified in 'meta_data'.
  repeated bytes raw_output = 3;

  uint64 batch_id = 100;
  float compute_time = 101;
  float request_time = 102;
}

================================================
FILE: examples/11_Protos/inference/request_status.proto
================================================
// Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. syntax = "proto3"; package nvidia.inferenceserver; // Status codes returned for inference server requests enum RequestStatusCode { INVALID = 0; SUCCESS = 1; UNKNOWN = 2; INTERNAL = 3; NOT_FOUND = 4; INVALID_ARG = 5; UNAVAILABLE = 6; UNSUPPORTED = 7; } // Status returned for all inference server requests message RequestStatus { // Required status code RequestStatusCode code = 1; // Optional message string msg = 2; // Inference server identifier. string server_id = 3; // Unique identifier for the request. Value 0 (zero) indicates // request ID is not known. uint64 request_id = 4; } ================================================ FILE: examples/11_Protos/inference/server_status.proto ================================================ // Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // * Neither the name of NVIDIA CORPORATION nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. 
// IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

syntax = "proto3";

package nvidia.inferenceserver;

import "model_config.proto";

// Statistic collecting a duration metric
message StatDuration {
  // Cumulative number of times this metric occurred.
  uint64 count = 1;

  // Total collected duration of this metric in nanoseconds.
  uint64 total_time_ns = 2;
}

// Stats collected for Status requests.
message StatusRequestStats {
  // Total time required to service api/status requests, not including
  // HTTP or gRPC endpoint termination time.
  StatDuration success = 1;
}

// Stats collected for Profile requests.
message ProfileRequestStats {
  // Total time required to service profile requests, not including
  // HTTP or gRPC endpoint termination time.
  StatDuration success = 1;
}

// Stats collected for Health requests.
message HealthRequestStats {
  // Total time required to service health requests, not including
  // HTTP or gRPC endpoint termination time.
  StatDuration success = 1;
}

// Stats collected for Infer requests.
message InferRequestStats {
  // Total time required to service successful inference requests,
  // not including HTTP or gRPC endpoint termination time.
  StatDuration success = 1;

  // Total time required to service failed inference requests, not
  // including HTTP or gRPC endpoint termination time.
  StatDuration failed = 2;

  // Time required to run inferencing including time waiting for an
  // available model instance, time copying input tensors to GPU
  // memory, time executing the model, and time copying output tensors
  // from GPU memory. Wait time is also captured separately in 'run_wait',
  // so to get inferencing time not including wait time use 'run' -
  // 'run_wait'.
  StatDuration run = 3;

  // Time waiting for an available model instance.
  StatDuration run_wait = 4;
}

// Model readiness states.
enum ModelReadyState {
  MODEL_UNKNOWN = 0;
  MODEL_READY = 1;
  MODEL_UNAVAILABLE = 2;
  MODEL_LOADING = 3;
  MODEL_UNLOADING = 4;
}

// Status for a version of a model.
message ModelVersionStatus {
  // Current readiness state for the model version.
  ModelReadyState ready_state = 1;

  // Duration statistics for each batch size used for this version of
  // the model.
  map<uint32, InferRequestStats> infer_stats = 2;
}

// Status for a model.
message ModelStatus {
  // The configuration for the model.
  ModelConfig config = 1;

  // Duration statistics for each version of this model.
  map<uint32, ModelVersionStatus> version_status = 2;
}

// Server readiness states.
enum ServerReadyState {
  SERVER_INVALID = 0;
  SERVER_INITIALIZING = 1;
  SERVER_READY = 2;
  SERVER_EXITING = 3;
  SERVER_FAILED_TO_INITIALIZE = 10;
}

// Status for inference server
message ServerStatus {
  // Server ID.
  string id = 1;

  // Server version.
  string version = 2;

  // Current readiness state for the server.
  ServerReadyState ready_state = 7;

  // Server uptime in nanoseconds
  uint64 uptime_ns = 3;

  // Status for each model on the server as map from <model name> ->
  // ModelStatus.
  map<string, ModelStatus> model_status = 4;

  // Statistics for Status requests.
  StatusRequestStats status_stats = 5;

  // Statistics for Profile requests.
  ProfileRequestStats profile_stats = 6;

  // Statistics for Health requests.
  HealthRequestStats health_stats = 8;
}
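The `run`/`run_wait` split documented above implies a simple computation on the consumer side: mean latency is `total_time_ns / count`, and pure inference time is the mean of `run` minus the mean of `run_wait`. A minimal sketch of that arithmetic, assuming the protoc-generated header `server_status.pb.h` (the default protoc naming); `MeanMs` and `PrintInferStats` are illustrative helper names, not part of the repository:

```
#include <iostream>
#include "server_status.pb.h"

// Mean duration of a StatDuration in milliseconds (0 if never observed).
static double MeanMs(const nvidia::inferenceserver::StatDuration& stat)
{
    if(stat.count() == 0) return 0.0;
    return static_cast<double>(stat.total_time_ns()) / stat.count() / 1e6;
}

static void PrintInferStats(const nvidia::inferenceserver::InferRequestStats& stats)
{
    double run_ms = MeanMs(stats.run());
    double wait_ms = MeanMs(stats.run_wait());
    std::cout << "mean e2e success: " << MeanMs(stats.success()) << " ms\n"
              << "mean run:         " << run_ms << " ms\n"
              << "mean queue wait:  " << wait_ms << " ms\n"
              << "mean compute:     " << (run_ms - wait_ms) << " ms\n"; // 'run' - 'run_wait'
}

int main()
{
    // Fabricated counters purely to exercise the arithmetic.
    nvidia::inferenceserver::InferRequestStats stats;
    stats.mutable_run()->set_count(100);
    stats.mutable_run()->set_total_time_ns(500000000); // 5 ms mean
    stats.mutable_run_wait()->set_count(100);
    stats.mutable_run_wait()->set_total_time_ns(100000000); // 1 ms mean
    PrintInferStats(stats);
    return 0;
}
```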
================================================
FILE: examples/12_ConfigGenerator/CMakeLists.txt
================================================
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

pybind11_add_module(config_generator generator.cc)

target_link_libraries(config_generator PUBLIC
  yais
  nv-inference-protos
)

================================================
FILE: examples/12_ConfigGenerator/README.md
================================================
# TensorRT Inference Server Model Store Builder

- Ensure you built the project.
- Run `./link.sh` in this directory

## Design Requirements

This example consists of a ModelStore manager (Python) and a ModelConfigGenerator (C++ w/ Python bindings).

The ModelConfigGenerator shall:

- [X] parse serialized TensorRT engine files
- [X] translate the necessary properties of the ICudaEngine to an `::nvidia::inferenceserver::ModelConfig` protobuf message
- [ ] not require the presence of CUDA or a GPU to perform the actions

The ModelStore manager consists of a Python class for direct consumption and a command-line application that shall:

- [ ] create and manage a model-store in a user-supplied filesystem directory
- [X] add TensorRT model files to the model store using the ModelConfigGenerator and user-specified arguments
- [ ] add new versions of TensorRT models to a ModelStore
- [ ] remove versions of entire models from the ModelStore
- [ ] add, edit, update and remove TensorFlow models
- [ ] add, edit, update and remove PyTorch/Caffe2 models

## Prototype Implementation

```
./ms_mgmt --help
Usage: ms_mgmt [OPTIONS]

Options:
  --engine PATH          TensorRT serialized engine  [required]
  --concurrency INTEGER  max number of concurrent executions allowed
  --name TEXT            model name; default to basename(engine) with the ext
                         dropped
  --version INTEGER      model version
  --store-path TEXT      model store path; default to ./model-store
  --help                 Show this message and exit.
``` ``` ./ms_mgmt --store-path=/tmp/model-store --engine=/work/models/ResNet-50-b1-fp32.engine --name=overridden-model-name --version=1337 --concurrency=10 ls /tmp/model-store/ overridden-model-name ls /tmp/model-store/overridden-model-name/1337/ ResNet-50-b1-fp32.engine model.plan cat /tmp/model-store/overridden-model-name/config.pbtxt name: "overridden-model-name" platform: "tensorrt_plan" max_batch_size: 1 input { name: "data" data_type: TYPE_FP32 dims: 3 dims: 224 dims: 224 } output { name: "prob" data_type: TYPE_FP32 dims: 1000 dims: 1 dims: 1 } instance_group { count: 10 gpus: 0 } ``` ================================================ FILE: examples/12_ConfigGenerator/generator.cc ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */
#include <memory>
#include <string>

#include <glog/logging.h>
#include <pybind11/pybind11.h>

#include "YAIS/TensorRT/TensorRT.h"
#include "YAIS/YAIS.h"

// NVIDIA Inference Server Protos
#include "nvidia_inference.grpc.pb.h"
#include "nvidia_inference.pb.h"

using nvidia::inferenceserver::ModelConfig;
using trtlab::TensorRT::Runtime;

static size_t DataTypeToBytes(nvidia::inferenceserver::DataType dataType)
{
    switch(dataType)
    {
        case nvidia::inferenceserver::TYPE_INVALID:
            CHECK(false) << "Invalid DataType used";
            return 0;
        case nvidia::inferenceserver::TYPE_BOOL:
        case nvidia::inferenceserver::TYPE_UINT8:
        case nvidia::inferenceserver::TYPE_INT8:
            return 1;
        case nvidia::inferenceserver::TYPE_UINT16:
        case nvidia::inferenceserver::TYPE_INT16:
        case nvidia::inferenceserver::TYPE_FP16:
            return 2;
        case nvidia::inferenceserver::TYPE_UINT32:
        case nvidia::inferenceserver::TYPE_INT32:
        case nvidia::inferenceserver::TYPE_FP32:
            return 4;
        case nvidia::inferenceserver::TYPE_UINT64:
        case nvidia::inferenceserver::TYPE_INT64:
        case nvidia::inferenceserver::TYPE_FP64:
            return 8;
        default:
            CHECK(false) << "Invalid DataType used";
            return 0;
    }
}

static nvidia::inferenceserver::DataType ConvertTensorRTDataType(nvinfer1::DataType trt_datatype)
{
    switch(trt_datatype)
    {
        case nvinfer1::DataType::kFLOAT:
            return nvidia::inferenceserver::TYPE_FP32;
        case nvinfer1::DataType::kHALF:
            return nvidia::inferenceserver::TYPE_FP16;
        case nvinfer1::DataType::kINT8:
            return nvidia::inferenceserver::TYPE_INT8;
        case nvinfer1::DataType::kINT32:
            return nvidia::inferenceserver::TYPE_INT32;
        default:
            LOG(FATAL) << "Unknown TensorRT DataType";
    }
}

std::string tensorrt_engine(std::string model_name, std::string engine, int concurrency)
{
    ModelConfig config;
    auto model = trtlab::TensorRT::Runtime::DeserializeEngine(engine);

    config.set_name(model_name);
    config.set_platform("tensorrt_plan");
    config.set_max_batch_size(model->GetMaxBatchSize());

    for(auto i : model->GetInputBindingIds())
    {
        const auto& binding = model->GetBinding(i);
        auto input = config.add_input();
        input->set_name(binding.name);
        input->set_data_type(ConvertTensorRTDataType(binding.dtype));
        for(auto d : binding.dims)
        {
            input->add_dims(d);
        }
    }

    for(auto i : model->GetOutputBindingIds())
    {
        const auto& binding = model->GetBinding(i);
        auto output = config.add_output();
        output->set_name(binding.name);
        output->set_data_type(ConvertTensorRTDataType(binding.dtype));
        for(auto d : binding.dims)
        {
            output->add_dims(d);
        }
    }

    auto instance_group = config.add_instance_group();
    CHECK(concurrency > 0) << "Concurrency must be > 0";
    instance_group->set_count(concurrency);
    instance_group->add_gpus(0);

    return config.DebugString();
}

namespace py = pybind11;

PYBIND11_MODULE(config_generator, m)
{
    m.doc() = R"pbdoc(
        Pybind11 Yais plugin
        --------------------
        .. currentmodule:: config_generator
        .. autosummary::
           :toctree: _generate
           tensorrt_engine
    )pbdoc";
    m.def("tensorrt_engine", &tensorrt_engine, R"pbdoc(
        Generate a TensorRT Inference Server ModelConfig from a serialized engine file
    )pbdoc");

#ifdef VERSION_INFO
    m.attr("__version__") = VERSION_INFO;
#else
    m.attr("__version__") = "dev";
#endif
}

================================================
FILE: examples/12_ConfigGenerator/link.sh
================================================
#!/bin/bash
ln -s /work/build/examples/12_ConfigGenerator/config_generator.cpython-35m-x86_64-linux-gnu.so

================================================
FILE: examples/12_ConfigGenerator/ms_mgmt
================================================
#!/usr/bin/env python3
import os
import pathlib
import shutil
from contextlib import contextmanager

import click

import config_generator as cg

FileType = click.Path(exists=True, file_okay=True, dir_okay=False, resolve_path=True)
PathType = click.Path(exists=True, file_okay=False, dir_okay=True, resolve_path=True)


@contextmanager
def cd(newdir):
    prevdir = os.getcwd()
    os.chdir(os.path.expanduser(newdir))
    try:
        yield
    finally:
        os.chdir(prevdir)


# Path(exists=False, file_okay=True, dir_okay=True, writable=False, readable=True, resolve_path=False)
class ModelStore:
    def __init__(self, *, path, create=False, validate=False):
        if not os.path.isdir(path):
            pathlib.Path(path).mkdir(parents=create, exist_ok=create)
        self.path = os.path.abspath(path)
        self.name = os.path.basename(self.path)

    def model_name(self, *, engine, name=None):
        base = os.path.basename(engine)
        model = name or base.replace(".engine", "")
        return model

    def model_path(self, *, name):
        return os.path.join(self.path, name)

    def engine_path(self, *, name, version=0):
        return os.path.join(self.model_path(name=name), str(version))

    def create_engine_path(self, *, name, version):
        engine_path = self.engine_path(name=name, version=version)
        if os.path.exists(engine_path):
            raise RuntimeError("{} already exists in the model store".format(engine_path))
        pathlib.Path(engine_path).mkdir(parents=True, exist_ok=True)
        return engine_path

    def copy_and_link_engine(self, *, name, version, engine):
        engine_path = self.create_engine_path(name=name, version=version)
        shutil.copy(engine, engine_path)
        with cd(engine_path):
            os.symlink(os.path.basename(engine), "model.plan")

    def add_tensorrt_engine(self, *, engine, name=None, concurrency=1, version=0):
        engine = os.path.abspath(engine)
        if not os.path.isfile(engine):
            raise RuntimeError("{} engine does not exist".format(engine))
        name = self.model_name(engine=engine, name=name)
        model_path = self.model_path(name=name)
        self.copy_and_link_engine(name=name, version=version, engine=engine)
        config = cg.tensorrt_engine(name, engine, concurrency)
        with cd(model_path), open("config.pbtxt", "w") as file:
            file.write(config)


@click.command()
@click.option("--engine", type=FileType, required=True, help="TensorRT serialized engine")
@click.option("--concurrency", type=int, default=1, help="max number of concurrent executions allowed")
@click.option("--name", default=None, help="model name; default to basename(engine) with the ext dropped")
@click.option("--version", type=int, default=0, help="model version")
@click.option("--store-path", default=None, help="model store path; default to ./model-store")
def main(engine, concurrency, name, store_path, version):
    store_path = store_path or "model-store"
    store = ModelStore(path=store_path, create=True)
    base = os.path.basename(engine)
    name = name or base.replace(".engine", "")
    store.add_tensorrt_engine(engine=engine,
name=name, concurrency=concurrency, version=version) if __name__ == "__main__": main() ================================================ FILE: examples/12_FlatBuffers/CMakeLists.txt ================================================ # Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. find_package(Flatbuffers) # This module defines # FLATBUFFERS_INCLUDE_DIR, directory containing headers # FLATBUFFERS_LIBS, directory containing flatbuffers libraries # FLATBUFFERS_STATIC_LIB, path to libflatbuffers.a # FLATBUFFERS_FOUND, whether flatbuffers has been found add_library(example-fbs example.grpc.fb.cc ) target_link_libraries(example-fbs PUBLIC flatbuffers ) target_include_directories(example-fbs PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ) add_executable(fb-server.x server.cc) target_link_libraries(fb-server.x nvrpc example-fbs gflags ) add_executable(fb-client.x client.cc) target_link_libraries(fb-client.x nvrpc example-fbs gflags ) ================================================ FILE: examples/12_FlatBuffers/client.cc ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. 
 * IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <chrono>
#include <iostream>
#include <memory>
#include <string>

#include <flatbuffers/grpc.h>
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <grpcpp/grpcpp.h>

#include "example.grpc.fb.h"
#include "example_generated.h"

using Input = flatbuffers::grpc::Message<HelloRequest>;
using Output = flatbuffers::grpc::Message<HelloReply>;

using grpc::Channel;
using grpc::ClientContext;
using grpc::Status;

class SimpleClient
{
  public:
    SimpleClient(std::shared_ptr<Channel> channel) : stub_(Greeter::NewStub(channel)) {}

    // Assembles the client's payload, sends it and presents the response back
    // from the server.
    std::string Compute(const int batch_id)
    {
        flatbuffers::grpc::MessageBuilder mb;

        // Data we are sending to the server.
        auto name_offset = mb.CreateString(std::to_string(batch_id));
        auto request_offset = CreateHelloRequest(mb, name_offset);
        mb.Finish(request_offset);
        auto request = mb.ReleaseMessage<HelloRequest>();

        // Container for the data we expect from the server.
        Output reply;

        // Context for the client. It could be used to convey extra information to
        // the server and/or tweak certain RPC behaviors.
        ClientContext context;

        // The actual RPC.
        Status status = stub_->SayHello(&context, request, &reply);

        // Act upon its status.
        if(status.ok())
        {
            const HelloReply* output = reply.GetRoot();
            return output->message()->str();
        }
        else
        {
            std::cout << status.error_code() << ": " << status.error_message() << std::endl;
            return "Fail!";
        }
    }

  private:
    std::unique_ptr<Greeter::Stub> stub_;
};

DEFINE_int32(count, 100, "number of grpc messages to send");

int main(int argc, char** argv)
{
    // Instantiate the client. It requires a channel, out of which the actual RPCs
    // are created. This channel models a connection to an endpoint (in this case,
    // localhost at port 50051). We indicate that the channel isn't authenticated
    // (use of InsecureChannelCredentials()).
    FLAGS_alsologtostderr = 1; // It will dump to console
    ::google::ParseCommandLineFlags(&argc, &argv, true);

    SimpleClient client(grpc::CreateChannel("localhost:50051", grpc::InsecureChannelCredentials()));

    auto start = std::chrono::steady_clock::now();
    for(int i = 0; i < FLAGS_count; i++)
    {
        auto reply = client.Compute(i);
        LOG_FIRST_N(INFO, 20) << reply;
    }
    auto end = std::chrono::steady_clock::now();
    float elapsed = std::chrono::duration<float>(end - start).count();
    std::cout << FLAGS_count << " requests in " << elapsed << " seconds" << std::endl;

    return 0;
}

================================================
FILE: examples/12_FlatBuffers/example.fbs
================================================
table HelloReply {
  message:string;
}

table HelloRequest {
  name:string;
}

table ManyHellosRequest {
  name:string;
  num_greetings:int;
}

rpc_service Greeter {
  SayHello(HelloRequest):HelloReply;
  SayManyHellos(ManyHellosRequest):HelloReply (streaming: "server");
}
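The example's own server (server.cc, linked against nvrpc in the CMakeLists above) uses the async completion-queue path. As a counterpart to the client above, here is a minimal *synchronous* server sketch against the same generated Greeter API; type and function names follow example.grpc.fb.h, the port matches the client's default, and this is an illustration rather than the repository's server implementation:

```
#include <memory>
#include <string>

#include <grpcpp/grpcpp.h>

#include "example.grpc.fb.h"
#include "example_generated.h"

class GreeterServiceImpl final : public Greeter::Service
{
    ::grpc::Status SayHello(::grpc::ServerContext* context,
                            const flatbuffers::grpc::Message<HelloRequest>* request,
                            flatbuffers::grpc::Message<HelloReply>* response) override
    {
        // Read the request root directly from the flatbuffer - no unpacking step.
        const HelloRequest* req = request->GetRoot();
        std::string greeting = "Hello, " + req->name()->str();

        // Build the reply in a MessageBuilder, then release it into the response.
        flatbuffers::grpc::MessageBuilder mb;
        auto msg_offset = mb.CreateString(greeting);
        auto reply_offset = CreateHelloReply(mb, msg_offset);
        mb.Finish(reply_offset);
        *response = mb.ReleaseMessage<HelloReply>();
        return ::grpc::Status::OK;
    }
};

int main()
{
    GreeterServiceImpl service;
    ::grpc::ServerBuilder builder;
    builder.AddListeningPort("0.0.0.0:50051", ::grpc::InsecureServerCredentials());
    builder.RegisterService(&service);
    auto server = builder.BuildAndStart();
    server->Wait();
    return 0;
}
```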
================================================
FILE: examples/12_FlatBuffers/example.grpc.fb.cc
================================================
// Generated by the gRPC C++ plugin.
// If you make any local change, they will be lost.
// source: example

#include "example.grpc.fb.h"
#include "example_generated.h"

#include <grpcpp/impl/codegen/async_stream.h>
#include <grpcpp/impl/codegen/async_unary_call.h>
#include <grpcpp/impl/codegen/channel_interface.h>
#include <grpcpp/impl/codegen/client_unary_call.h>
#include <grpcpp/impl/codegen/method_handler_impl.h>
#include <grpcpp/impl/codegen/rpc_service_method.h>
#include <grpcpp/impl/codegen/service_type.h>
#include <grpcpp/impl/codegen/sync_stream.h>

static const char* Greeter_method_names[] = {
    "/Greeter/SayHello",
    "/Greeter/SayManyHellos",
};

std::unique_ptr<Greeter::Stub> Greeter::NewStub(
    const std::shared_ptr<::grpc::ChannelInterface>& channel, const ::grpc::StubOptions& options)
{
    std::unique_ptr<Greeter::Stub> stub(new Greeter::Stub(channel));
    return stub;
}

Greeter::Stub::Stub(const std::shared_ptr<::grpc::ChannelInterface>& channel)
    : channel_(channel),
      rpcmethod_SayHello_(Greeter_method_names[0], ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
      rpcmethod_SayManyHellos_(Greeter_method_names[1], ::grpc::internal::RpcMethod::SERVER_STREAMING, channel)
{
}

::grpc::Status Greeter::Stub::SayHello(::grpc::ClientContext* context,
                                       const flatbuffers::grpc::Message<HelloRequest>& request,
                                       flatbuffers::grpc::Message<HelloReply>* response)
{
    return ::grpc::internal::BlockingUnaryCall(channel_.get(), rpcmethod_SayHello_, context,
                                               request, response);
}

::grpc::ClientAsyncResponseReader<flatbuffers::grpc::Message<HelloReply>>*
Greeter::Stub::AsyncSayHelloRaw(::grpc::ClientContext* context,
                                const flatbuffers::grpc::Message<HelloRequest>& request,
                                ::grpc::CompletionQueue* cq)
{
    return ::grpc::internal::ClientAsyncResponseReaderFactory<
        flatbuffers::grpc::Message<HelloReply>>::Create(channel_.get(), cq, rpcmethod_SayHello_,
                                                        context, request, true);
}

::grpc::ClientAsyncResponseReader<flatbuffers::grpc::Message<HelloReply>>*
Greeter::Stub::PrepareAsyncSayHelloRaw(::grpc::ClientContext* context,
                                       const flatbuffers::grpc::Message<HelloRequest>& request,
                                       ::grpc::CompletionQueue* cq)
{
    return ::grpc::internal::ClientAsyncResponseReaderFactory<
        flatbuffers::grpc::Message<HelloReply>>::Create(channel_.get(), cq, rpcmethod_SayHello_,
                                                        context, request, false);
}

::grpc::ClientReader<flatbuffers::grpc::Message<HelloReply>>*
Greeter::Stub::SayManyHellosRaw(::grpc::ClientContext* context,
                                const flatbuffers::grpc::Message<ManyHellosRequest>& request)
{
    return ::grpc::internal::ClientReaderFactory<flatbuffers::grpc::Message<HelloReply>>::Create(
        channel_.get(), rpcmethod_SayManyHellos_, context, request);
}

::grpc::ClientAsyncReader<flatbuffers::grpc::Message<HelloReply>>*
Greeter::Stub::AsyncSayManyHellosRaw(::grpc::ClientContext* context,
                                     const flatbuffers::grpc::Message<ManyHellosRequest>& request,
                                     ::grpc::CompletionQueue* cq, void* tag)
{
    return ::grpc::internal::ClientAsyncReaderFactory<
        flatbuffers::grpc::Message<HelloReply>>::Create(channel_.get(), cq, rpcmethod_SayManyHellos_,
                                                        context, request, true, tag);
}

::grpc::ClientAsyncReader<flatbuffers::grpc::Message<HelloReply>>*
Greeter::Stub::PrepareAsyncSayManyHellosRaw(::grpc::ClientContext* context,
                                            const flatbuffers::grpc::Message<ManyHellosRequest>& request,
                                            ::grpc::CompletionQueue* cq)
{
    return ::grpc::internal::ClientAsyncReaderFactory<
        flatbuffers::grpc::Message<HelloReply>>::Create(channel_.get(), cq, rpcmethod_SayManyHellos_,
                                                        context, request, false, nullptr);
}

Greeter::Service::Service()
{
    AddMethod(new ::grpc::internal::RpcServiceMethod(
        Greeter_method_names[0], ::grpc::internal::RpcMethod::NORMAL_RPC,
        new ::grpc::internal::RpcMethodHandler<Greeter::Service,
                                               flatbuffers::grpc::Message<HelloRequest>,
                                               flatbuffers::grpc::Message<HelloReply>>(
            std::mem_fn(&Greeter::Service::SayHello), this)));
    AddMethod(new ::grpc::internal::RpcServiceMethod(
        Greeter_method_names[1], ::grpc::internal::RpcMethod::SERVER_STREAMING,
        new ::grpc::internal::ServerStreamingHandler<Greeter::Service,
                                                     flatbuffers::grpc::Message<ManyHellosRequest>,
                                                     flatbuffers::grpc::Message<HelloReply>>(
            std::mem_fn(&Greeter::Service::SayManyHellos), this)));
}

Greeter::Service::~Service() {}

::grpc::Status Greeter::Service::SayHello(::grpc::ServerContext* context,
                                          const flatbuffers::grpc::Message<HelloRequest>* request,
                                          flatbuffers::grpc::Message<HelloReply>* response)
{
    (void)context;
    (void)request;
    (void)response;
    return ::grpc::Status(::grpc::StatusCode::UNIMPLEMENTED, "");
}

::grpc::Status Greeter::Service::SayManyHellos(
    ::grpc::ServerContext* context, const flatbuffers::grpc::Message<ManyHellosRequest>* request,
    ::grpc::ServerWriter<flatbuffers::grpc::Message<HelloReply>>* writer)
{
    (void)context;
    (void)request;
    (void)writer;
    return ::grpc::Status(::grpc::StatusCode::UNIMPLEMENTED, "");
}
================================================
FILE: examples/12_FlatBuffers/example.grpc.fb.h
================================================
// Generated by the gRPC C++ plugin.
// If you make any local change, they will be lost.
// source: example
#ifndef GRPC_example__INCLUDED
#define GRPC_example__INCLUDED

#include "example_generated.h"
#include "flatbuffers/grpc.h"

#include <grpcpp/impl/codegen/async_generic_service.h>
#include <grpcpp/impl/codegen/async_stream.h>
#include <grpcpp/impl/codegen/async_unary_call.h>
#include <grpcpp/impl/codegen/method_handler_impl.h>
#include <grpcpp/impl/codegen/rpc_method.h>
#include <grpcpp/impl/codegen/service_type.h>
#include <grpcpp/impl/codegen/status.h>
#include <grpcpp/impl/codegen/stub_options.h>
#include <grpcpp/impl/codegen/sync_stream.h>

namespace grpc {
class CompletionQueue;
class Channel;
class ServerCompletionQueue;
class ServerContext;
} // namespace grpc

class Greeter final
{
  public:
    static constexpr char const* service_full_name() { return "Greeter"; }

    class StubInterface
    {
      public:
        virtual ~StubInterface() {}
        virtual ::grpc::Status SayHello(::grpc::ClientContext* context,
                                        const flatbuffers::grpc::Message<HelloRequest>& request,
                                        flatbuffers::grpc::Message<HelloReply>* response) = 0;
        std::unique_ptr<
            ::grpc::ClientAsyncResponseReaderInterface<flatbuffers::grpc::Message<HelloReply>>>
        AsyncSayHello(::grpc::ClientContext* context,
                      const flatbuffers::grpc::Message<HelloRequest>& request,
                      ::grpc::CompletionQueue* cq)
        {
            return std::unique_ptr<::grpc::ClientAsyncResponseReaderInterface<
                flatbuffers::grpc::Message<HelloReply>>>(AsyncSayHelloRaw(context, request, cq));
        }
        std::unique_ptr<
            ::grpc::ClientAsyncResponseReaderInterface<flatbuffers::grpc::Message<HelloReply>>>
        PrepareAsyncSayHello(::grpc::ClientContext* context,
                             const flatbuffers::grpc::Message<HelloRequest>& request,
                             ::grpc::CompletionQueue* cq)
        {
            return std::unique_ptr<::grpc::ClientAsyncResponseReaderInterface<
                flatbuffers::grpc::Message<HelloReply>>>(
                PrepareAsyncSayHelloRaw(context, request, cq));
        }
        std::unique_ptr<::grpc::ClientReaderInterface<flatbuffers::grpc::Message<HelloReply>>>
        SayManyHellos(::grpc::ClientContext* context,
                      const flatbuffers::grpc::Message<ManyHellosRequest>& request)
        {
            return std::unique_ptr<
                ::grpc::ClientReaderInterface<flatbuffers::grpc::Message<HelloReply>>>(
                SayManyHellosRaw(context, request));
        }
        std::unique_ptr<::grpc::ClientAsyncReaderInterface<flatbuffers::grpc::Message<HelloReply>>>
        AsyncSayManyHellos(::grpc::ClientContext* context,
                           const flatbuffers::grpc::Message<ManyHellosRequest>& request,
                           ::grpc::CompletionQueue* cq, void* tag)
        {
            return std::unique_ptr<
                ::grpc::ClientAsyncReaderInterface<flatbuffers::grpc::Message<HelloReply>>>(
                AsyncSayManyHellosRaw(context, request, cq, tag));
        }
        std::unique_ptr<::grpc::ClientAsyncReaderInterface<flatbuffers::grpc::Message<HelloReply>>>
        PrepareAsyncSayManyHellos(::grpc::ClientContext* context,
                                  const flatbuffers::grpc::Message<ManyHellosRequest>& request,
                                  ::grpc::CompletionQueue* cq)
        {
            return std::unique_ptr<
                ::grpc::ClientAsyncReaderInterface<flatbuffers::grpc::Message<HelloReply>>>(
                PrepareAsyncSayManyHellosRaw(context, request, cq));
        }

      private:
        virtual ::grpc::ClientAsyncResponseReaderInterface<flatbuffers::grpc::Message<HelloReply>>*
        AsyncSayHelloRaw(::grpc::ClientContext* context,
                         const flatbuffers::grpc::Message<HelloRequest>& request,
                         ::grpc::CompletionQueue* cq) = 0;
        virtual ::grpc::ClientAsyncResponseReaderInterface<flatbuffers::grpc::Message<HelloReply>>*
        PrepareAsyncSayHelloRaw(::grpc::ClientContext* context,
                                const flatbuffers::grpc::Message<HelloRequest>& request,
                                ::grpc::CompletionQueue* cq) = 0;
        virtual ::grpc::ClientReaderInterface<flatbuffers::grpc::Message<HelloReply>>*
        SayManyHellosRaw(::grpc::ClientContext* context,
                         const flatbuffers::grpc::Message<ManyHellosRequest>& request) = 0;
        virtual ::grpc::ClientAsyncReaderInterface<flatbuffers::grpc::Message<HelloReply>>*
        AsyncSayManyHellosRaw(::grpc::ClientContext* context,
                              const flatbuffers::grpc::Message<ManyHellosRequest>& request,
                              ::grpc::CompletionQueue* cq, void* tag) = 0;
        virtual ::grpc::ClientAsyncReaderInterface<flatbuffers::grpc::Message<HelloReply>>*
        PrepareAsyncSayManyHellosRaw(::grpc::ClientContext* context,
                                     const flatbuffers::grpc::Message<ManyHellosRequest>& request,
                                     ::grpc::CompletionQueue* cq) = 0;
    };
  class Stub final : public StubInterface {
   public:
    Stub(const std::shared_ptr<::grpc::ChannelInterface>& channel);
    ::grpc::Status SayHello(::grpc::ClientContext* context, const flatbuffers::grpc::Message<HelloRequest>& request, flatbuffers::grpc::Message<HelloReply>* response) override;
    std::unique_ptr<::grpc::ClientAsyncResponseReader<flatbuffers::grpc::Message<HelloReply>>> AsyncSayHello(::grpc::ClientContext* context, const flatbuffers::grpc::Message<HelloRequest>& request, ::grpc::CompletionQueue* cq) {
      return std::unique_ptr<::grpc::ClientAsyncResponseReader<flatbuffers::grpc::Message<HelloReply>>>(AsyncSayHelloRaw(context, request, cq));
    }
    std::unique_ptr<::grpc::ClientAsyncResponseReader<flatbuffers::grpc::Message<HelloReply>>> PrepareAsyncSayHello(::grpc::ClientContext* context, const flatbuffers::grpc::Message<HelloRequest>& request, ::grpc::CompletionQueue* cq) {
      return std::unique_ptr<::grpc::ClientAsyncResponseReader<flatbuffers::grpc::Message<HelloReply>>>(PrepareAsyncSayHelloRaw(context, request, cq));
    }
    std::unique_ptr<::grpc::ClientReader<flatbuffers::grpc::Message<HelloReply>>> SayManyHellos(::grpc::ClientContext* context, const flatbuffers::grpc::Message<ManyHellosRequest>& request) {
      return std::unique_ptr<::grpc::ClientReader<flatbuffers::grpc::Message<HelloReply>>>(SayManyHellosRaw(context, request));
    }
    std::unique_ptr<::grpc::ClientAsyncReader<flatbuffers::grpc::Message<HelloReply>>> AsyncSayManyHellos(::grpc::ClientContext* context, const flatbuffers::grpc::Message<ManyHellosRequest>& request, ::grpc::CompletionQueue* cq, void* tag) {
      return std::unique_ptr<::grpc::ClientAsyncReader<flatbuffers::grpc::Message<HelloReply>>>(AsyncSayManyHellosRaw(context, request, cq, tag));
    }
    std::unique_ptr<::grpc::ClientAsyncReader<flatbuffers::grpc::Message<HelloReply>>> PrepareAsyncSayManyHellos(::grpc::ClientContext* context, const flatbuffers::grpc::Message<ManyHellosRequest>& request, ::grpc::CompletionQueue* cq) {
      return std::unique_ptr<::grpc::ClientAsyncReader<flatbuffers::grpc::Message<HelloReply>>>(PrepareAsyncSayManyHellosRaw(context, request, cq));
    }

   private:
    std::shared_ptr<::grpc::ChannelInterface> channel_;
    ::grpc::ClientAsyncResponseReader<flatbuffers::grpc::Message<HelloReply>>* AsyncSayHelloRaw(::grpc::ClientContext* context, const flatbuffers::grpc::Message<HelloRequest>& request, ::grpc::CompletionQueue* cq) override;
    ::grpc::ClientAsyncResponseReader<flatbuffers::grpc::Message<HelloReply>>* PrepareAsyncSayHelloRaw(::grpc::ClientContext* context, const flatbuffers::grpc::Message<HelloRequest>& request, ::grpc::CompletionQueue* cq) override;
    ::grpc::ClientReader<flatbuffers::grpc::Message<HelloReply>>* SayManyHellosRaw(::grpc::ClientContext* context, const flatbuffers::grpc::Message<ManyHellosRequest>& request) override;
    ::grpc::ClientAsyncReader<flatbuffers::grpc::Message<HelloReply>>* AsyncSayManyHellosRaw(::grpc::ClientContext* context, const flatbuffers::grpc::Message<ManyHellosRequest>& request, ::grpc::CompletionQueue* cq, void* tag) override;
    ::grpc::ClientAsyncReader<flatbuffers::grpc::Message<HelloReply>>* PrepareAsyncSayManyHellosRaw(::grpc::ClientContext* context, const flatbuffers::grpc::Message<ManyHellosRequest>& request, ::grpc::CompletionQueue* cq) override;
    const ::grpc::internal::RpcMethod rpcmethod_SayHello_;
    const ::grpc::internal::RpcMethod rpcmethod_SayManyHellos_;
  };
  static std::unique_ptr<Stub> NewStub(const std::shared_ptr<::grpc::ChannelInterface>& channel, const ::grpc::StubOptions& options = ::grpc::StubOptions());

  class Service : public ::grpc::Service {
   public:
    Service();
    virtual ~Service();
    virtual ::grpc::Status SayHello(::grpc::ServerContext* context, const flatbuffers::grpc::Message<HelloRequest>* request, flatbuffers::grpc::Message<HelloReply>* response);
    virtual ::grpc::Status SayManyHellos(::grpc::ServerContext* context, const flatbuffers::grpc::Message<ManyHellosRequest>* request, ::grpc::ServerWriter<flatbuffers::grpc::Message<HelloReply>>* writer);
  };
  template <class BaseClass>
  class WithAsyncMethod_SayHello : public BaseClass {
   private:
    void BaseClassMustBeDerivedFromService(const Service* service) {}

   public:
    WithAsyncMethod_SayHello() { ::grpc::Service::MarkMethodAsync(0); }
    ~WithAsyncMethod_SayHello() override { BaseClassMustBeDerivedFromService(this); }
    // disable synchronous version of this method
    ::grpc::Status SayHello(::grpc::ServerContext* context, const flatbuffers::grpc::Message<HelloRequest>* request, flatbuffers::grpc::Message<HelloReply>* response) final override {
      abort();
      return ::grpc::Status(::grpc::StatusCode::UNIMPLEMENTED, "");
    }
    void RequestSayHello(::grpc::ServerContext* context, flatbuffers::grpc::Message<HelloRequest>* request, ::grpc::ServerAsyncResponseWriter<flatbuffers::grpc::Message<HelloReply>>* response, ::grpc::CompletionQueue* new_call_cq, ::grpc::ServerCompletionQueue* notification_cq, void* tag) {
      ::grpc::Service::RequestAsyncUnary(0, context, request, response, new_call_cq, notification_cq, tag);
    }
  };
  template <class BaseClass>
  class WithAsyncMethod_SayManyHellos : public BaseClass {
   private:
    void BaseClassMustBeDerivedFromService(const Service* service) {}

   public:
    WithAsyncMethod_SayManyHellos() { ::grpc::Service::MarkMethodAsync(1); }
    ~WithAsyncMethod_SayManyHellos() override { BaseClassMustBeDerivedFromService(this); }
    // disable synchronous version of this method
    ::grpc::Status SayManyHellos(::grpc::ServerContext* context, const flatbuffers::grpc::Message<ManyHellosRequest>* request, ::grpc::ServerWriter<flatbuffers::grpc::Message<HelloReply>>* writer) final override {
      abort();
      return ::grpc::Status(::grpc::StatusCode::UNIMPLEMENTED, "");
    }
    void RequestSayManyHellos(::grpc::ServerContext* context, flatbuffers::grpc::Message<ManyHellosRequest>* request, ::grpc::ServerAsyncWriter<flatbuffers::grpc::Message<HelloReply>>* writer, ::grpc::CompletionQueue* new_call_cq, ::grpc::ServerCompletionQueue* notification_cq, void* tag) {
      ::grpc::Service::RequestAsyncServerStreaming(1, context, request, writer, new_call_cq, notification_cq, tag);
    }
  };
  typedef WithAsyncMethod_SayHello<WithAsyncMethod_SayManyHellos<Service>> AsyncService;
  template <class BaseClass>
  class WithGenericMethod_SayHello : public BaseClass {
   private:
    void BaseClassMustBeDerivedFromService(const Service* service) {}

   public:
    WithGenericMethod_SayHello() { ::grpc::Service::MarkMethodGeneric(0); }
    ~WithGenericMethod_SayHello() override { BaseClassMustBeDerivedFromService(this); }
    // disable synchronous version of this method
    ::grpc::Status SayHello(::grpc::ServerContext* context, const flatbuffers::grpc::Message<HelloRequest>* request, flatbuffers::grpc::Message<HelloReply>* response) final override {
      abort();
      return ::grpc::Status(::grpc::StatusCode::UNIMPLEMENTED, "");
    }
  };
  template <class BaseClass>
  class WithGenericMethod_SayManyHellos : public BaseClass {
   private:
    void BaseClassMustBeDerivedFromService(const Service* service) {}

   public:
    WithGenericMethod_SayManyHellos() { ::grpc::Service::MarkMethodGeneric(1); }
    ~WithGenericMethod_SayManyHellos() override { BaseClassMustBeDerivedFromService(this); }
    // disable synchronous version of this method
    ::grpc::Status SayManyHellos(::grpc::ServerContext* context, const flatbuffers::grpc::Message<ManyHellosRequest>* request, ::grpc::ServerWriter<flatbuffers::grpc::Message<HelloReply>>* writer) final override {
      abort();
      return ::grpc::Status(::grpc::StatusCode::UNIMPLEMENTED, "");
    }
  };
  template <class BaseClass>
  class WithStreamedUnaryMethod_SayHello : public BaseClass {
   private:
    void BaseClassMustBeDerivedFromService(const Service* service) {}

   public:
    WithStreamedUnaryMethod_SayHello() {
      ::grpc::Service::MarkMethodStreamed(
          0, new ::grpc::internal::StreamedUnaryHandler<
                 flatbuffers::grpc::Message<HelloRequest>,
                 flatbuffers::grpc::Message<HelloReply>>(
                 std::bind(&WithStreamedUnaryMethod_SayHello::StreamedSayHello,
                           this, std::placeholders::_1, std::placeholders::_2)));
    }
    ~WithStreamedUnaryMethod_SayHello() override { BaseClassMustBeDerivedFromService(this); }
    // disable regular version of this method
    ::grpc::Status SayHello(::grpc::ServerContext* context, const flatbuffers::grpc::Message<HelloRequest>* request, flatbuffers::grpc::Message<HelloReply>* response) final override {
      abort();
      return ::grpc::Status(::grpc::StatusCode::UNIMPLEMENTED, "");
    }
    // replace default version of method with streamed unary
    virtual ::grpc::Status StreamedSayHello(::grpc::ServerContext* context, ::grpc::ServerUnaryStreamer<flatbuffers::grpc::Message<HelloRequest>, flatbuffers::grpc::Message<HelloReply>>* server_unary_streamer) = 0;
  };
  typedef WithStreamedUnaryMethod_SayHello<Service> StreamedUnaryService;
  template <class BaseClass>
  class WithSplitStreamingMethod_SayManyHellos : public BaseClass {
   private:
    void BaseClassMustBeDerivedFromService(const Service* service) {}

   public:
    WithSplitStreamingMethod_SayManyHellos() {
      ::grpc::Service::MarkMethodStreamed(
          1, new ::grpc::internal::SplitServerStreamingHandler<
                 flatbuffers::grpc::Message<ManyHellosRequest>,
                 flatbuffers::grpc::Message<HelloReply>>(std::bind(
                 &WithSplitStreamingMethod_SayManyHellos::StreamedSayManyHellos,
                 this, std::placeholders::_1, std::placeholders::_2)));
    }
    ~WithSplitStreamingMethod_SayManyHellos() override { BaseClassMustBeDerivedFromService(this); }
    // disable regular version of this method
    ::grpc::Status SayManyHellos(::grpc::ServerContext* context, const flatbuffers::grpc::Message<ManyHellosRequest>* request, ::grpc::ServerWriter<flatbuffers::grpc::Message<HelloReply>>* writer) final override {
      abort();
      return ::grpc::Status(::grpc::StatusCode::UNIMPLEMENTED, "");
    }
    // replace default version of method with split streamed
    virtual ::grpc::Status StreamedSayManyHellos(::grpc::ServerContext* context, ::grpc::ServerSplitStreamer<flatbuffers::grpc::Message<ManyHellosRequest>, flatbuffers::grpc::Message<HelloReply>>* server_split_streamer) = 0;
  };
  typedef WithSplitStreamingMethod_SayManyHellos<Service> SplitStreamedService;
  typedef WithStreamedUnaryMethod_SayHello<WithSplitStreamingMethod_SayManyHellos<Service>> StreamedService;
};

#endif  // GRPC_example__INCLUDED

================================================
FILE: examples/12_FlatBuffers/example_generated.h
================================================
// automatically generated by the FlatBuffers compiler, do not modify

#ifndef FLATBUFFERS_GENERATED_EXAMPLE_H_
#define FLATBUFFERS_GENERATED_EXAMPLE_H_

#include "flatbuffers/flatbuffers.h"

struct HelloReply;
struct HelloRequest;
struct ManyHellosRequest;

struct HelloReply FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
  enum { VT_MESSAGE = 4 };
  const flatbuffers::String* message() const {
    return GetPointer<const flatbuffers::String*>(VT_MESSAGE);
  }
  bool Verify(flatbuffers::Verifier& verifier) const {
    return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_MESSAGE) &&
           verifier.VerifyString(message()) && verifier.EndTable();
  }
};

struct HelloReplyBuilder {
  flatbuffers::FlatBufferBuilder& fbb_;
  flatbuffers::uoffset_t start_;
  void add_message(flatbuffers::Offset<flatbuffers::String> message) {
    fbb_.AddOffset(HelloReply::VT_MESSAGE, message);
  }
  explicit HelloReplyBuilder(flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb) {
    start_ = fbb_.StartTable();
  }
  HelloReplyBuilder& operator=(const HelloReplyBuilder&);
  flatbuffers::Offset<HelloReply> Finish() {
    const auto end = fbb_.EndTable(start_);
    auto o = flatbuffers::Offset<HelloReply>(end);
    return o;
  }
};

inline flatbuffers::Offset<HelloReply> CreateHelloReply(
    flatbuffers::FlatBufferBuilder& _fbb,
    flatbuffers::Offset<flatbuffers::String> message = 0) {
  HelloReplyBuilder builder_(_fbb);
  builder_.add_message(message);
  return builder_.Finish();
}

inline flatbuffers::Offset<HelloReply> CreateHelloReplyDirect(
    flatbuffers::FlatBufferBuilder& _fbb, const char* message = nullptr) {
  return CreateHelloReply(_fbb, message ? _fbb.CreateString(message) : 0);
}
struct HelloRequest FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
  enum { VT_NAME = 4 };
  const flatbuffers::String* name() const {
    return GetPointer<const flatbuffers::String*>(VT_NAME);
  }
  bool Verify(flatbuffers::Verifier& verifier) const {
    return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_NAME) &&
           verifier.VerifyString(name()) && verifier.EndTable();
  }
};

struct HelloRequestBuilder {
  flatbuffers::FlatBufferBuilder& fbb_;
  flatbuffers::uoffset_t start_;
  void add_name(flatbuffers::Offset<flatbuffers::String> name) {
    fbb_.AddOffset(HelloRequest::VT_NAME, name);
  }
  explicit HelloRequestBuilder(flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb) {
    start_ = fbb_.StartTable();
  }
  HelloRequestBuilder& operator=(const HelloRequestBuilder&);
  flatbuffers::Offset<HelloRequest> Finish() {
    const auto end = fbb_.EndTable(start_);
    auto o = flatbuffers::Offset<HelloRequest>(end);
    return o;
  }
};

inline flatbuffers::Offset<HelloRequest> CreateHelloRequest(
    flatbuffers::FlatBufferBuilder& _fbb,
    flatbuffers::Offset<flatbuffers::String> name = 0) {
  HelloRequestBuilder builder_(_fbb);
  builder_.add_name(name);
  return builder_.Finish();
}

inline flatbuffers::Offset<HelloRequest> CreateHelloRequestDirect(
    flatbuffers::FlatBufferBuilder& _fbb, const char* name = nullptr) {
  return CreateHelloRequest(_fbb, name ? _fbb.CreateString(name) : 0);
}

struct ManyHellosRequest FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
  enum { VT_NAME = 4, VT_NUM_GREETINGS = 6 };
  const flatbuffers::String* name() const {
    return GetPointer<const flatbuffers::String*>(VT_NAME);
  }
  int32_t num_greetings() const {
    return GetField<int32_t>(VT_NUM_GREETINGS, 0);
  }
  bool Verify(flatbuffers::Verifier& verifier) const {
    return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_NAME) &&
           verifier.VerifyString(name()) &&
           VerifyField<int32_t>(verifier, VT_NUM_GREETINGS) &&
           verifier.EndTable();
  }
};

struct ManyHellosRequestBuilder {
  flatbuffers::FlatBufferBuilder& fbb_;
  flatbuffers::uoffset_t start_;
  void add_name(flatbuffers::Offset<flatbuffers::String> name) {
    fbb_.AddOffset(ManyHellosRequest::VT_NAME, name);
  }
  void add_num_greetings(int32_t num_greetings) {
    fbb_.AddElement<int32_t>(ManyHellosRequest::VT_NUM_GREETINGS, num_greetings, 0);
  }
  explicit ManyHellosRequestBuilder(flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb) {
    start_ = fbb_.StartTable();
  }
  ManyHellosRequestBuilder& operator=(const ManyHellosRequestBuilder&);
  flatbuffers::Offset<ManyHellosRequest> Finish() {
    const auto end = fbb_.EndTable(start_);
    auto o = flatbuffers::Offset<ManyHellosRequest>(end);
    return o;
  }
};

inline flatbuffers::Offset<ManyHellosRequest> CreateManyHellosRequest(
    flatbuffers::FlatBufferBuilder& _fbb,
    flatbuffers::Offset<flatbuffers::String> name = 0,
    int32_t num_greetings = 0) {
  ManyHellosRequestBuilder builder_(_fbb);
  builder_.add_num_greetings(num_greetings);
  builder_.add_name(name);
  return builder_.Finish();
}

inline flatbuffers::Offset<ManyHellosRequest> CreateManyHellosRequestDirect(
    flatbuffers::FlatBufferBuilder& _fbb, const char* name = nullptr,
    int32_t num_greetings = 0) {
  return CreateManyHellosRequest(_fbb, name ? _fbb.CreateString(name) : 0,
                                 num_greetings);
}

#endif  // FLATBUFFERS_GENERATED_EXAMPLE_H_
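The generated `Create*`/`Create*Direct` helpers above are the whole construction API for these tables. As a quick, self-contained illustration of how they compose (a standalone sketch, not a file from this repository; it assumes only `example_generated.h` and the FlatBuffers headers are on the include path), the following builds a `ManyHellosRequest`, verifies the finished buffer the same way the gRPC layer does for messages arriving off the wire, and reads the fields back without any unpacking:

```cpp
#include <cassert>
#include <iostream>

#include "flatbuffers/flatbuffers.h"
#include "example_generated.h"

int main()
{
    // Build a ManyHellosRequest with the generated "Direct" helper, which
    // copies the C string into the builder for us.
    flatbuffers::FlatBufferBuilder fbb;
    auto req = CreateManyHellosRequestDirect(fbb, "World", /*num_greetings=*/3);
    fbb.Finish(req);

    // Verify before trusting the buffer; the generated Verify() walks the
    // table and checks every offset and string.
    auto* msg = flatbuffers::GetRoot<ManyHellosRequest>(fbb.GetBufferPointer());
    flatbuffers::Verifier verifier(fbb.GetBufferPointer(), fbb.GetSize());
    assert(msg->Verify(verifier));

    // Access is zero-copy: fields are read directly out of the buffer.
    std::cout << msg->name()->str() << " x " << msg->num_greetings() << std::endl;
    return 0;
}
```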
================================================
FILE: examples/12_FlatBuffers/server.cc
================================================
/* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <chrono>
#include <memory>

#include <gflags/gflags.h>
#include <glog/logging.h>

#include "nvrpc/executor.h"
#include "nvrpc/server.h"
#include "nvrpc/service.h"
#include "tensorrt/laboratory/core/resources.h"

#include "example.grpc.fb.h"
#include "example_generated.h"

using nvrpc::AsyncRPC;
using nvrpc::AsyncService;
using nvrpc::Context;
using nvrpc::Executor;
using nvrpc::Server;
using trtlab::Resources;

using Request = flatbuffers::grpc::Message<HelloRequest>;
using Response = flatbuffers::grpc::Message<HelloReply>;

struct SimpleResources : public Resources
{
};

class SimpleContext final : public Context<Request, Response, SimpleResources>
{
    void ExecuteRPC(Request& input, Response& output) final override
    {
        flatbuffers::grpc::MessageBuilder mb_;

        // We call GetRoot to "parse" the message. Verification is already
        // performed by default. See the notes below for more details.
        const HelloRequest* request = input.GetRoot();

        // Fields are retrieved as usual with FlatBuffers
        const std::string& name = request->name()->str();

        // `flatbuffers::grpc::MessageBuilder` is a `FlatBufferBuilder` with a
        // special allocator for efficient gRPC buffer transfer, but otherwise
        // usage is the same as usual.
        auto msg_offset = mb_.CreateString("Hello, " + name);
        auto hello_offset = CreateHelloReply(mb_, msg_offset);
        mb_.Finish(hello_offset);

        // The `ReleaseMessage()` function detaches the message from the
        // builder, so we can transfer the response to gRPC while simultaneously
        // detaching that memory buffer from the builder.
        output = mb_.ReleaseMessage<HelloReply>();
        CHECK(output.Verify());
        this->FinishResponse();
    }
};

int main(int argc, char* argv[])
{
    FLAGS_alsologtostderr = 1; // Log to console
    ::google::InitGoogleLogging("flatbuffer service");
    ::google::ParseCommandLineFlags(&argc, &argv, true);

    // A server will bind an IP:PORT to listen on
    Server server("0.0.0.0:50051");

    // A server can host multiple services
    auto simpleInference = server.RegisterAsyncService<Greeter>();

    auto rpcCompute = simpleInference->RegisterRPC<SimpleContext>(&Greeter::AsyncService::RequestSayHello);

    auto rpcResources = std::make_shared<SimpleResources>();

    auto executor = server.RegisterExecutor(new Executor(1));
    executor->RegisterContexts(rpcCompute, rpcResources, 10);

    LOG(INFO) << "Running Server";
    server.Run(std::chrono::milliseconds(2000), [] {
        // This is a timeout loop executed every 2 seconds
        // Run() with no arguments will run an empty timeout loop every 5 seconds.
        // RunAsync() will return immediately; it's your responsibility to ensure the
        // server doesn't go out of scope or a Shutdown will be triggered on your services.
    });
}
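For reference, this server is paired with a client in the same directory (`client.cc`). The sketch below is a minimal standalone reconstruction of what the unary `SayHello` call looks like from the client side, not the repository's actual client: the generated stub plus a `flatbuffers::grpc::MessageBuilder` are all that is needed. It assumes the server above is running.

```cpp
#include <iostream>

#include <grpcpp/grpcpp.h>

#include "example.grpc.fb.h"
#include "example_generated.h"

int main()
{
    // Connect to the address the server above binds to.
    auto channel = grpc::CreateChannel("localhost:50051",
                                       grpc::InsecureChannelCredentials());
    auto stub = Greeter::NewStub(channel);

    // MessageBuilder is a FlatBufferBuilder backed by a gRPC-friendly
    // allocator, so the finished buffer can be handed to gRPC without a copy.
    flatbuffers::grpc::MessageBuilder mb;
    auto name_offset = mb.CreateString("World");
    auto request_offset = CreateHelloRequest(mb, name_offset);
    mb.Finish(request_offset);
    auto request_msg = mb.ReleaseMessage<HelloRequest>();

    flatbuffers::grpc::Message<HelloReply> response_msg;
    grpc::ClientContext context;
    auto status = stub->SayHello(&context, request_msg, &response_msg);
    if (!status.ok()) {
        std::cerr << status.error_code() << ": " << status.error_message() << std::endl;
        return 1;
    }

    // As on the server side, GetRoot() gives zero-copy access to the reply.
    const HelloReply* reply = response_msg.GetRoot();
    std::cout << reply->message()->str() << std::endl;
    return 0;
}
```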
================================================
FILE: examples/30_PyTensorRT/README.md
================================================
# Python Inference Example

================================================
FILE: examples/30_PyTensorRT/client.py
================================================
#!/usr/bin/env python3
import os
import time

import numpy as np

import infer
import infer_test_utils as utils


def main():
    manager = infer.RemoteInferenceManager(hostname="localhost:50052")
    models = manager.get_models()
    print(models)

    mnist = manager.infer_runner("mnist")

    print("Input Bindings: {}".format(mnist.input_bindings()))
    print("Output Bindings: {}".format(mnist.output_bindings()))

    inputs = utils.load_inputs("/work/models/onnx/mnist-v1.3/test_data_set_0")
    expected = utils.load_outputs("/work/models/onnx/mnist-v1.3/test_data_set_0")

    start = time.process_time()
    results = [mnist.infer(Input3=input) for input in inputs]
    results = [r.get() for r in results]
    print("Compute Time: {}".format(time.process_time() - start))
    print(results)

    # for r, e in zip(results, expected):
    #     for key, val in r.items():
    #         print("Output Binding Name: {}; shape{}".format(key, val.shape))
    #         r = val.reshape((1,10))
    #         np.testing.assert_almost_equal(r, e, decimal=3)
    # models.serve()
    #mnist_model = models.get_model("mnist")
    #benchmark = infer.InferBench(models)
    #benchmark.run(mnist_model, 1, 0.1)
    #print(results)


if __name__ == "__main__":
    main()

================================================
FILE: examples/30_PyTensorRT/compute.py
================================================
#!/usr/bin/env python3
import itertools
import os
import time

import numpy as np

import trtlab
import infer_test_utils as utils


def main():
    models = trtlab.InferenceManager(max_exec_concurrency=1)
    mnist = models.register_tensorrt_engine("mnist", "/work/models/onnx/mnist-v1.3/mnist-v1.3.engine")
    models.update_resources()

    print("Input Bindings: {}".format(mnist.input_bindings()))
    print("Output Bindings: {}".format(mnist.output_bindings()))

    inputs = utils.load_inputs("/work/models/onnx/mnist-v1.3/test_data_set_0")
    expected = utils.load_outputs("/work/models/onnx/mnist-v1.3/test_data_set_0")

    start = time.process_time()
    while True:
        futures = [mnist.infer(Input3=inputs[0]) for _ in range(100)]
        results = [f.get() for f in futures]
        # while True:
        #     results = [mnist.infer(Input3=input) for input in itertools.repeat(inputs[0], 1000)]
        #     results = [r.get() for r in results]
        #     time.sleep(0.1)
        print("Compute Time: {}".format(time.process_time() - start))

    # for r, e in zip(results, expected):
    #     for key, val in r.items():
    #         print("Output Binding Name: {}; shape{}".format(key, val.shape))
    #         r = val.reshape((1,10))
    #         np.testing.assert_almost_equal(r, e, decimal=3)
    #mnist_model = models.get_model("mnist")
    #benchmark = infer.InferBench(models)
    #benchmark.run(mnist_model, 1, 0.1)
    #print(results)


if __name__ == "__main__":
    main()
================================================
FILE: examples/30_PyTensorRT/infer_test_utils.py
================================================
#!/usr/bin/env python3
import glob
import os

import onnx
from onnx import numpy_helper
from matplotlib import pyplot as plt
import numpy as np


def load_inputs(test_data_dir):
    # Load inputs
    inputs = []
    inputs_num = len(glob.glob(os.path.join(test_data_dir, 'input_*.pb')))
    for i in range(inputs_num):
        input_file = os.path.join(test_data_dir, 'input_{}.pb'.format(i))
        tensor = onnx.TensorProto()
        with open(input_file, 'rb') as f:
            tensor.ParseFromString(f.read())
        inputs.append(numpy_helper.to_array(tensor))
    return inputs


def load_outputs(test_data_dir):
    # Load reference outputs
    ref_outputs = []
    ref_outputs_num = len(glob.glob(os.path.join(test_data_dir, 'output_*.pb')))
    for i in range(ref_outputs_num):
        output_file = os.path.join(test_data_dir, 'output_{}.pb'.format(i))
        tensor = onnx.TensorProto()
        with open(output_file, 'rb') as f:
            tensor.ParseFromString(f.read())
        ref_outputs.append(numpy_helper.to_array(tensor))
    return ref_outputs


def mnist_image(data):
    two_d = (np.reshape(data, (28, 28))).astype(np.uint8)
    plt.imshow(two_d, interpolation='nearest')
    return plt


def softmax(x):
    """Compute softmax values for each set of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

================================================
FILE: examples/30_PyTensorRT/rebuild.sh
================================================
#!/bin/bash
cd /work/build/tensorrt-laboratory/python
make -j
cd /work/examples/30_PyTensorRT
if [ ! -e infer.cpython-35m-x86_64-linux-gnu.so ]; then
    ln -s /work/build/tensorrt-laboratory/python/tensorrt/infer.cpython-35m-x86_64-linux-gnu.so
fi

================================================
FILE: examples/30_PyTensorRT/server.py
================================================
#!/usr/bin/env python3
import os
import time

import numpy as np

import infer
import infer_test_utils as utils


def main():
    models = infer.InferenceManager(max_exec_concurrency=2)
    mnist = models.register_tensorrt_engine("mnist", "/work/models/onnx/mnist-v1.3/mnist-v1.3.engine")
    models.update_resources()

    print("Input Bindings: {}".format(mnist.input_bindings()))
    print("Output Bindings: {}".format(mnist.output_bindings()))

    inputs = utils.load_inputs("/work/models/onnx/mnist-v1.3/test_data_set_0")
    expected = utils.load_outputs("/work/models/onnx/mnist-v1.3/test_data_set_0")

    start = time.process_time()
    results = [mnist.infer(Input3=input) for input in inputs]
    results = [r.get() for r in results]
    print("Compute Time: {}".format(time.process_time() - start))

    for r, e in zip(results, expected):
        for key, val in r.items():
            print("Output Binding Name: {}; shape{}".format(key, val.shape))
            r = val.reshape((1,10))
            np.testing.assert_almost_equal(r, e, decimal=3)

    models.serve()
    #mnist_model = models.get_model("mnist")
    #benchmark = infer.InferBench(models)
    #benchmark.run(mnist_model, 1, 0.1)
    #print(results)


if __name__ == "__main__":
    main()

================================================
FILE: examples/90_Kubernetes/README.md
================================================
# Kubernetes

Using [Kubernetes on NVIDIA GPUs, aka KONG](https://developer.nvidia.com/kubernetes-gpu) is a great way of deploying GPU accelerated microservices. This page will act as a guide for both development and production deployment.

* For development, we will use [minikube](https://kubernetes.io/docs/setup/minikube/) to deploy a single-node Kubernetes cluster.
* For production, we will use a Kubernetes cluster installed by the [DeepOps project](https://github.com/nvidia/deepops).
## Prerequisites

* [Kubernetes v1.10.0](https://kubernetes.io)
* [NVIDIA GPU Device Plugin](https://github.com/NVIDIA/k8s-device-plugin#preparing-your-gpu-nodes)
* [Helm](https://helm.sh)
* [prometheus-operator](https://github.com/coreos/prometheus-operator)

```
helm repo add coreos https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
```

## Setup

The following packages will be installed on your Kubernetes cluster:

* [CoreOS's Prometheus Operator](https://github.com/coreos/prometheus-operator) for gathering and monitoring metrics
* [Istio v0.8](https://istio.io) for ingress and load-balancing

After the installation of those packages, we will deploy the following:

* Scalable [K8s Deployment](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/) of the [TensorRT GRPC example](../02_TensorRT_GRPC) `inference-grpc.x`
* YAIS specific instance of a Prometheus server that will scrape any Pods labeled `scrape: yais`
* Istio `Gateway` and `VirtualService` to route load-balanced traffic to our gRPC service.

## Install

At this point, you should have a Kubernetes cluster with all the prerequisites. If you use the [minikube setup](minikube/README.md) you can simply run:

```
./bootstrap-minikube.sh
```

Otherwise, you can choose to install each of the components manually.

### Prometheus Operator

Initialize Helm and install the `prometheus-operator` and `kube-prometheus`:

```
cd ../prometheus
./bootstrap.sh
cd ..
```

Monitor `kubectl get pods -n monitoring` and wait for everything to come up. Customize any settings in the [custom-settings.yml](prometheus/custom-settings.yml) file. This project exposes the Grafana server as a `NodePort` and provides a custom datasource and dashboards for YAIS metrics.

### Istio

Initialize Istio. Rendered Istio manifests, modified to expose the gateways as a `NodePort`, are provided under [istio](istio/README.md). If you are using a cloud instance, you can change this to a `LoadBalancer`.

```
kubectl create namespace istio-system
kubectl apply -f istio/istio-v1.0-minikube.yml
kubectl label namespace default istio-injection=enabled
```

### YAIS Service

```
kubectl apply -f yais-deploy.yml
```

This does the following:

* `Deployment` - launches the service and resources
* `Service` - provides access policy to the deployment pods
* `ServiceMonitor` - tells our Prometheus server to scrape YAIS metrics
* `Gateway` - ingress host, port and protocol
* `VirtualService` - routing ingress to services

### Test the Service

Use the [`devel.sh`](devel.sh) script in the project's root directory.

```
# from project root
./devel.sh
cd build/examples/02_TensorRT_GRPC
./siege.x --port 31380 --rate=1000
```

`31380` is the default `NodePort` for the Istio `ingressgateway`.

Note: If you get errors, sometimes it takes a short while before the ingress gateway is updated to reflect the routing.

### Check the Metrics

```
kubectl get svc -n monitoring | grep grafana
```

The default login is `admin/admin`. Navigate to the `YAIS` dashboard. Celebrate.

================================================
FILE: examples/90_Kubernetes/bootstrap-minikube.sh
================================================
#!/bin/bash

if ! [ -x "$(command -v helm)" ]; then
  echo 'Error: helm is not installed.' >&2
  exit 1
fi
# minikube
(cd minikube && ./bootstrap.sh)

# prometheus-operator
(cd prometheus && ./bootstrap.sh)

# istio
kubectl create namespace istio-system
kubectl apply -f istio/istio-v1.0-minikube.yml
sleep 30
kubectl label namespace default istio-injection=enabled

# deploy yais example
kubectl apply -f yais-deploy.yml

# sleep 15
# kubectl label namespace default istio-injection-

================================================
FILE: examples/90_Kubernetes/deploy/build-and-run.sh
================================================
#!/bin/bash

default_engine=/work/models/ResNet-152-b8-fp16.engine

concurrency=${YAIS_CONCURRENCY:-1}
engine=${YAIS_TRT_ENGINE:-$default_engine}

if [ "$engine" = "$default_engine" ]; then
  if [ ! -e $engine ]; then
    cd /work/models
    ./setup.py
  fi
fi

/work/build/examples/02_TensorRT_GRPC/inference-grpc.x \
  --engine=${engine} \
  --contexts=${concurrency}

================================================
FILE: examples/90_Kubernetes/devel/README.md
================================================
## Round 1: External Service

Before deploying a YAIS service with Kubernetes, we will first set up a developer environment where we execute our service in a Docker development container. We can still use our Kubernetes/Prometheus/Grafana environment to gather and visualize metrics. To do so, we will create an external service pointing at our host.

Edit `yais-devel.yml` and modify the IP address of the `Endpoints` object to point at the host machine running minikube (`sudo minikube ip`).

```
apiVersion: v1
kind: Endpoints
metadata:
  name: yais-devel
subsets:
  - addresses:
      - ip: 10.0.0.10 # <== ChangeMe
    ports:
      - name: metrics
        port: 50078
```

```
kubectl apply -f yais-devel.yml
```

This will create a Prometheus `ServiceMonitor` that scrapes the external service, i.e. the Docker development container. This is a good first step toward integrating your service into Kubernetes without having to do a full-blown deployment.

Congrats: your minikube cluster is now looking for services labeled `scrape: yais`, and when it finds one, it will automatically start scraping the port labeled `metrics`.

The final step is to bring an inference service online and to generate some load on that service. Launch the YAIS development container using the `devel.sh` script in the project's root directory. Make sure all the examples and models have been built; see [README::Quickstart](README.md#quickstart).
```
cd examples/97_SingleProcessMultiSteam
root@dgx:/work/examples/97_SingleProcessMultiSteam# ./launch_service.sh 1 1 /work/models/ResNet-50-b1-fp32.engine
I0709 10:13:41.175212 468 Server.cc:37] gRPC listening on: 0.0.0.0:50051
I0709 10:13:41.175477 468 server.cc:229] Register Service (flowers::Inference) with Server
I0709 10:13:41.175492 468 server.cc:238] Register RPC (flowers::Inference::Compute) with Service (flowers::Inference)
I0709 10:13:41.175500 468 server.cc:243] Initializing Resources for RPC (flowers::Inference::Compute)
I0709 10:13:41.273568 468 TensorRT.cc:561] -- Initialzing TensorRT Resource Manager --
I0709 10:13:41.273602 468 TensorRT.cc:562] Maximum Execution Concurrency: 1
I0709 10:13:41.273609 468 TensorRT.cc:563] Maximum Copy Concurrency: 3
I0709 10:13:42.596443 468 TensorRT.cc:628] -- Registering Model: flowers --
I0709 10:13:42.596500 468 TensorRT.cc:629] Input/Output Tensors require 591.9 KiB
I0709 10:13:42.596511 468 TensorRT.cc:630] Execution Activations require 7.8 MiB
I0709 10:13:42.604210 468 TensorRT.cc:652] -- Allocating TensorRT Resources --
I0709 10:13:42.604228 468 TensorRT.cc:653] Creating 1 TensorRT execution tokens.
I0709 10:13:42.604236 468 TensorRT.cc:654] Creating a Pool of 3 Host/Device Memory Stacks
I0709 10:13:42.604248 468 TensorRT.cc:655] Each Host Stack contains 608.0 KiB
I0709 10:13:42.604256 468 TensorRT.cc:656] Each Device Stack contains 8.5 MiB
I0709 10:13:42.604264 468 TensorRT.cc:657] Total GPU Memory: 25.5 MiB
I0709 10:13:42.606546 468 server.cc:255] Initializing Executor
I0709 10:13:42.606832 468 server.cc:259] Registering Execution Contexts for RPC (flowers::Inference::Compute) with Executor
I0709 10:13:42.606889 468 server.cc:262] Running Server
warmup with client-async.x
1000 requests in 2.60522seconds; inf/sec: 383.845
Starting a shell keeping the services and load-balancer running...
Try /work/build/examples/02_TensorRT_GRPC/siege.x --rate=2000 --port=50051
1 x /work/models/ResNet-50-b1-fp32.engine
Subshell:
```

Use `telegraf` and watch the scrape count; the `yais-devel` scraper is set to pull metrics every 2 seconds. It can take up to a minute or so before you see scraping from your k8s cluster.

```
1 x /work/models/ResNet-50-b1-fp32.engine
Subshell: telegraf -test -config /work/examples/91_Prometheus/scrape.conf
...
> exposer_bytes_transferred,host=dgx,url=http://localhost:50078/metrics counter=0 1531131559000000000 # <== watch the counter
...
```

## Round 2: Package and Deploy

TODO - We could use some community help here.

## Round 3: Optimize Deploy Container

TODO - We could use some community help here.
================================================
FILE: examples/90_Kubernetes/devel/yais-devel.yml
================================================
---
apiVersion: v1
kind: Service
metadata:
  name: yais-devel
  labels:
    app: yais-devel
spec:
  ports:
    - name: metrics
      port: 51078
      targetPort: 50078
---
apiVersion: v1
kind: Endpoints
metadata:
  name: yais-devel
subsets:
  - addresses:
      - ip: 10.0.0.10
    ports:
      - name: metrics
        port: 50078
---
# this will get scraped by the default kube-prometheus
# and the yais-metric prometheus service (if it is running)
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: yais-devel
  labels:
    scrape: yais
spec:
  selector:
    matchLabels:
      app: yais-devel
  endpoints:
    - port: metrics
      interval: 2s
      honorLabels: true

================================================
FILE: examples/90_Kubernetes/istio/README.md
================================================
# Istio

## Install

```
# Download the latest release
curl -L https://git.io/getLatestIstio | sh -

# Istio 1.0
helm template install/kubernetes/helm/istio --name istio --namespace istio-system \
  --set gateways.istio-ingressgateway.type=NodePort \
  --set gateways.istio-egressgateway.type=NodePort > istio-v1.0-minikube.yml
```

Install Istio and label the default namespace for injection. With this configuration, pods in the labeled namespace receive sidecars unless they carry the annotation shown below.

```
kubectl create namespace istio-system
kubectl apply -f istio-v1.0-minikube.yml
kubectl label namespace default istio-injection=enabled
kubectl get namespace -L istio-injection
```

The annotation used to opt a Deployment out of sidecar injection:

```
apiVersion: extensions/v1beta1
kind: Deployment
...
spec:
  template:
    metadata:
      annotations:                        # <== sidecar
        sidecar.istio.io/inject: "false"  # <== annotation
...
```

================================================
FILE: examples/90_Kubernetes/istio/rendered/istio-v0.8-minikube.yml
================================================
apiVersion: v1 kind: Namespace metadata: name: istio-system --- # Source: istio/charts/mixer/templates/configmap.yaml apiVersion: v1 kind: ConfigMap metadata: name: istio-statsd-prom-bridge namespace: istio-system labels: app: istio-statsd-prom-bridge chart: mixer-0.8.0 release: RELEASE-NAME heritage: Tiller istio: mixer data: mapping.conf: |- --- apiVersion: v1 kind: ConfigMap metadata: name: istio-mixer-custom-resources namespace: istio-system labels: app: istio-mixer chart: mixer-0.8.0 release: RELEASE-NAME heritage: Tiller istio: mixer data: custom-resources.yaml: |- apiVersion: "config.istio.io/v1alpha2" kind: attributemanifest metadata: name: istioproxy namespace: istio-system spec: attributes: origin.ip: valueType: IP_ADDRESS origin.uid: valueType: STRING origin.user: valueType: STRING request.headers: valueType: STRING_MAP request.id: valueType: STRING request.host: valueType: STRING request.method: valueType: STRING request.path: valueType: STRING request.reason: valueType: STRING request.referer: valueType: STRING request.scheme: valueType: STRING request.total_size: valueType: INT64 request.size: valueType: INT64 request.time: valueType: TIMESTAMP request.useragent: valueType: STRING response.code: valueType: INT64 response.duration: valueType: DURATION response.headers: valueType: STRING_MAP response.total_size: valueType: INT64 response.size: valueType: INT64 response.time: valueType: TIMESTAMP source.uid: valueType: STRING source.user: valueType: STRING destination.uid: valueType: STRING connection.id: valueType: STRING connection.received.bytes: valueType: INT64
connection.received.bytes_total: valueType: INT64 connection.sent.bytes: valueType: INT64 connection.sent.bytes_total: valueType: INT64 connection.duration: valueType: DURATION connection.mtls: valueType: BOOL context.protocol: valueType: STRING context.timestamp: valueType: TIMESTAMP context.time: valueType: TIMESTAMP api.service: valueType: STRING api.version: valueType: STRING api.operation: valueType: STRING api.protocol: valueType: STRING request.auth.principal: valueType: STRING request.auth.audiences: valueType: STRING request.auth.presenter: valueType: STRING request.auth.claims: valueType: STRING_MAP request.auth.raw_claims: valueType: STRING request.api_key: valueType: STRING --- apiVersion: "config.istio.io/v1alpha2" kind: attributemanifest metadata: name: kubernetes namespace: istio-system spec: attributes: source.ip: valueType: IP_ADDRESS source.labels: valueType: STRING_MAP source.name: valueType: STRING source.namespace: valueType: STRING source.service: valueType: STRING source.serviceAccount: valueType: STRING destination.ip: valueType: IP_ADDRESS destination.labels: valueType: STRING_MAP destination.name: valueType: STRING destination.namespace: valueType: STRING destination.service: valueType: STRING destination.serviceAccount: valueType: STRING --- apiVersion: "config.istio.io/v1alpha2" kind: stdio metadata: name: handler namespace: istio-system spec: outputAsJson: true --- apiVersion: "config.istio.io/v1alpha2" kind: logentry metadata: name: accesslog namespace: istio-system spec: severity: '"Info"' timestamp: request.time variables: originIp: origin.ip | ip("0.0.0.0") sourceIp: source.ip | ip("0.0.0.0") sourceService: source.service | "" sourceUser: source.user | source.uid | "" sourceNamespace: source.namespace | "" destinationIp: destination.ip | ip("0.0.0.0") destinationService: destination.service | "" destinationNamespace: destination.namespace | "" apiName: api.service | "" apiVersion: api.version | "" apiClaims: request.headers["sec-istio-auth-userinfo"]| "" apiKey: request.api_key | request.headers["x-api-key"] | "" requestOperation: api.operation | "" protocol: request.scheme | "http" method: request.method | "" url: request.path | "" responseCode: response.code | 0 responseSize: response.size | 0 requestSize: request.size | 0 latency: response.duration | "0ms" connectionMtls: connection.mtls | false userAgent: request.useragent | "" responseTimestamp: response.time receivedBytes: request.total_size | connection.received.bytes | 0 sentBytes: response.total_size | connection.sent.bytes | 0 referer: request.referer | "" monitored_resource_type: '"UNSPECIFIED"' --- apiVersion: "config.istio.io/v1alpha2" kind: rule metadata: name: stdio namespace: istio-system spec: match: "true" # If omitted match is true. 
actions: - handler: handler.stdio instances: - accesslog.logentry --- apiVersion: "config.istio.io/v1alpha2" kind: metric metadata: name: requestcount namespace: istio-system spec: value: "1" dimensions: source_service: source.service | "unknown" source_version: source.labels["version"] | "unknown" destination_service: destination.service | "unknown" destination_version: destination.labels["version"] | "unknown" response_code: response.code | 200 connection_mtls: connection.mtls | false monitored_resource_type: '"UNSPECIFIED"' --- apiVersion: "config.istio.io/v1alpha2" kind: metric metadata: name: requestduration namespace: istio-system spec: value: response.duration | "0ms" dimensions: source_service: source.service | "unknown" source_version: source.labels["version"] | "unknown" destination_service: destination.service | "unknown" destination_version: destination.labels["version"] | "unknown" response_code: response.code | 200 connection_mtls: connection.mtls | false monitored_resource_type: '"UNSPECIFIED"' --- apiVersion: "config.istio.io/v1alpha2" kind: metric metadata: name: requestsize namespace: istio-system spec: value: request.size | 0 dimensions: source_service: source.service | "unknown" source_version: source.labels["version"] | "unknown" destination_service: destination.service | "unknown" destination_version: destination.labels["version"] | "unknown" response_code: response.code | 200 connection_mtls: connection.mtls | false monitored_resource_type: '"UNSPECIFIED"' --- apiVersion: "config.istio.io/v1alpha2" kind: metric metadata: name: responsesize namespace: istio-system spec: value: response.size | 0 dimensions: source_service: source.service | "unknown" source_version: source.labels["version"] | "unknown" destination_service: destination.service | "unknown" destination_version: destination.labels["version"] | "unknown" response_code: response.code | 200 connection_mtls: connection.mtls | false monitored_resource_type: '"UNSPECIFIED"' --- apiVersion: "config.istio.io/v1alpha2" kind: metric metadata: name: tcpbytesent namespace: istio-system labels: istio-protocol: tcp # needed so that mixer will only generate when context.protocol == tcp spec: value: connection.sent.bytes | 0 dimensions: source_service: source.service | "unknown" source_version: source.labels["version"] | "unknown" destination_service: destination.service | "unknown" destination_version: destination.labels["version"] | "unknown" connection_mtls: connection.mtls | false monitored_resource_type: '"UNSPECIFIED"' --- apiVersion: "config.istio.io/v1alpha2" kind: metric metadata: name: tcpbytereceived namespace: istio-system labels: istio-protocol: tcp # needed so that mixer will only generate when context.protocol == tcp spec: value: connection.received.bytes | 0 dimensions: source_service: source.service | "unknown" source_version: source.labels["version"] | "unknown" destination_service: destination.service | "unknown" destination_version: destination.labels["version"] | "unknown" connection_mtls: connection.mtls | false monitored_resource_type: '"UNSPECIFIED"' --- apiVersion: "config.istio.io/v1alpha2" kind: prometheus metadata: name: handler namespace: istio-system spec: metrics: - name: request_count instance_name: requestcount.metric.istio-system kind: COUNTER label_names: - source_service - source_version - destination_service - destination_version - response_code - connection_mtls - name: request_duration instance_name: requestduration.metric.istio-system kind: DISTRIBUTION label_names: - source_service 
- source_version - destination_service - destination_version - response_code - connection_mtls buckets: explicit_buckets: bounds: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10] - name: request_size instance_name: requestsize.metric.istio-system kind: DISTRIBUTION label_names: - source_service - source_version - destination_service - destination_version - response_code - connection_mtls buckets: exponentialBuckets: numFiniteBuckets: 8 scale: 1 growthFactor: 10 - name: response_size instance_name: responsesize.metric.istio-system kind: DISTRIBUTION label_names: - source_service - source_version - destination_service - destination_version - response_code - connection_mtls buckets: exponentialBuckets: numFiniteBuckets: 8 scale: 1 growthFactor: 10 - name: tcp_bytes_sent instance_name: tcpbytesent.metric.istio-system kind: COUNTER label_names: - source_service - source_version - destination_service - destination_version - connection_mtls - name: tcp_bytes_received instance_name: tcpbytereceived.metric.istio-system kind: COUNTER label_names: - source_service - source_version - destination_service - destination_version - connection_mtls --- apiVersion: "config.istio.io/v1alpha2" kind: rule metadata: name: promhttp namespace: istio-system labels: istio-protocol: http spec: actions: - handler: handler.prometheus instances: - requestcount.metric - requestduration.metric - requestsize.metric - responsesize.metric --- apiVersion: "config.istio.io/v1alpha2" kind: rule metadata: name: promtcp namespace: istio-system labels: istio-protocol: tcp # needed so that mixer will only execute when context.protocol == TCP spec: actions: - handler: handler.prometheus instances: - tcpbytesent.metric - tcpbytereceived.metric --- apiVersion: "config.istio.io/v1alpha2" kind: kubernetesenv metadata: name: handler namespace: istio-system spec: # when running from mixer root, use the following config after adding a # symbolic link to a kubernetes config file via: # # $ ln -s ~/.kube/config mixer/adapter/kubernetes/kubeconfig # # kubeconfig_path: "mixer/adapter/kubernetes/kubeconfig" --- apiVersion: "config.istio.io/v1alpha2" kind: rule metadata: name: kubeattrgenrulerule namespace: istio-system spec: actions: - handler: handler.kubernetesenv instances: - attributes.kubernetes --- apiVersion: "config.istio.io/v1alpha2" kind: rule metadata: name: tcpkubeattrgenrulerule namespace: istio-system spec: match: context.protocol == "tcp" actions: - handler: handler.kubernetesenv instances: - attributes.kubernetes --- apiVersion: "config.istio.io/v1alpha2" kind: kubernetes metadata: name: attributes namespace: istio-system spec: # Pass the required attribute data to the adapter source_uid: source.uid | "" source_ip: source.ip | ip("0.0.0.0") # default to unspecified ip addr destination_uid: destination.uid | "" origin_uid: '""' origin_ip: ip("0.0.0.0") # default to unspecified ip addr attribute_bindings: # Fill the new attributes from the adapter produced output. 
# $out refers to an instance of OutputTemplate message source.ip: $out.source_pod_ip | ip("0.0.0.0") source.labels: $out.source_labels | emptyStringMap() source.namespace: $out.source_namespace | "default" source.service: $out.source_service | "unknown" source.serviceAccount: $out.source_service_account_name | "unknown" destination.ip: $out.destination_pod_ip | ip("0.0.0.0") destination.labels: $out.destination_labels | emptyStringMap() destination.namespace: $out.destination_namespace | "default" destination.service: $out.destination_service | "unknown" destination.serviceAccount: $out.destination_service_account_name | "unknown" --- # Configuration needed by Mixer. # Mixer cluster is delivered via CDS # Specify mixer cluster settings apiVersion: networking.istio.io/v1alpha3 kind: DestinationRule metadata: name: istio-policy namespace: istio-system spec: host: istio-policy.istio-system.svc.cluster.local trafficPolicy: connectionPool: http: http2MaxRequests: 10000 maxRequestsPerConnection: 10000 --- apiVersion: networking.istio.io/v1alpha3 kind: DestinationRule metadata: name: istio-telemetry namespace: istio-system spec: host: istio-telemetry.istio-system.svc.cluster.local trafficPolicy: connectionPool: http: http2MaxRequests: 10000 maxRequestsPerConnection: 10000 --- --- # Source: istio/charts/prometheus/templates/configmap.yaml apiVersion: v1 kind: ConfigMap metadata: name: prometheus namespace: istio-system labels: app: prometheus chart: prometheus-0.1.0 release: RELEASE-NAME heritage: Tiller data: prometheus.yml: |- global: scrape_interval: 15s scrape_configs: - job_name: 'istio-mesh' # Override the global default and scrape targets from this job every 5 seconds. scrape_interval: 5s kubernetes_sd_configs: - role: endpoints relabel_configs: - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] action: keep regex: istio-system;istio-telemetry;prometheus - job_name: 'envoy' # Override the global default and scrape targets from this job every 5 seconds. scrape_interval: 5s # metrics_path defaults to '/metrics' # scheme defaults to 'http'. kubernetes_sd_configs: - role: endpoints relabel_configs: - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] action: keep regex: istio-system;istio-statsd-prom-bridge;statsd-prom - job_name: 'istio-policy' # Override the global default and scrape targets from this job every 5 seconds. scrape_interval: 5s # metrics_path defaults to '/metrics' # scheme defaults to 'http'. kubernetes_sd_configs: - role: endpoints relabel_configs: - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] action: keep regex: istio-system;istio-policy;http-monitoring - job_name: 'istio-telemetry' # Override the global default and scrape targets from this job every 5 seconds. scrape_interval: 5s # metrics_path defaults to '/metrics' # scheme defaults to 'http'. kubernetes_sd_configs: - role: endpoints relabel_configs: - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] action: keep regex: istio-system;istio-telemetry;http-monitoring - job_name: 'pilot' # Override the global default and scrape targets from this job every 5 seconds. scrape_interval: 5s # metrics_path defaults to '/metrics' # scheme defaults to 'http'. 
kubernetes_sd_configs: - role: endpoints relabel_configs: - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] action: keep regex: istio-system;istio-pilot;http-monitoring # scrape config for API servers - job_name: 'kubernetes-apiservers' kubernetes_sd_configs: - role: endpoints scheme: https tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token relabel_configs: - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] action: keep regex: default;kubernetes;https # scrape config for nodes (kubelet) - job_name: 'kubernetes-nodes' scheme: https tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token kubernetes_sd_configs: - role: node relabel_configs: - action: labelmap regex: __meta_kubernetes_node_label_(.+) - target_label: __address__ replacement: kubernetes.default.svc:443 - source_labels: [__meta_kubernetes_node_name] regex: (.+) target_label: __metrics_path__ replacement: /api/v1/nodes/${1}/proxy/metrics # Scrape config for Kubelet cAdvisor. # # This is required for Kubernetes 1.7.3 and later, where cAdvisor metrics # (those whose names begin with 'container_') have been removed from the # Kubelet metrics endpoint. This job scrapes the cAdvisor endpoint to # retrieve those metrics. # # In Kubernetes 1.7.0-1.7.2, these metrics are only exposed on the cAdvisor # HTTP endpoint; use "replacement: /api/v1/nodes/${1}:4194/proxy/metrics" # in that case (and ensure cAdvisor's HTTP server hasn't been disabled with # the --cadvisor-port=0 Kubelet flag). # # This job is not necessary and should be removed in Kubernetes 1.6 and # earlier versions, or it will cause the metrics to be scraped twice. - job_name: 'kubernetes-cadvisor' scheme: https tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token kubernetes_sd_configs: - role: node relabel_configs: - action: labelmap regex: __meta_kubernetes_node_label_(.+) - target_label: __address__ replacement: kubernetes.default.svc:443 - source_labels: [__meta_kubernetes_node_name] regex: (.+) target_label: __metrics_path__ replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor # scrape config for service endpoints. - job_name: 'kubernetes-service-endpoints' kubernetes_sd_configs: - role: endpoints relabel_configs: - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] action: keep regex: true - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] action: replace target_label: __scheme__ regex: (https?) 
- source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] action: replace target_label: __metrics_path__ regex: (.+) - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] action: replace target_label: __address__ regex: ([^:]+)(?::\d+)?;(\d+) replacement: $1:$2 - action: labelmap regex: __meta_kubernetes_service_label_(.+) - source_labels: [__meta_kubernetes_namespace] action: replace target_label: kubernetes_namespace - source_labels: [__meta_kubernetes_service_name] action: replace target_label: kubernetes_name # Example scrape config for pods - job_name: 'kubernetes-pods' kubernetes_sd_configs: - role: pod relabel_configs: - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] action: keep regex: true - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] action: replace target_label: __metrics_path__ regex: (.+) - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] action: replace regex: ([^:]+)(?::\d+)?;(\d+) replacement: $1:$2 target_label: __address__ - action: labelmap regex: __meta_kubernetes_pod_label_(.+) - source_labels: [__meta_kubernetes_namespace] action: replace target_label: namespace - source_labels: [__meta_kubernetes_pod_name] action: replace target_label: pod_name --- # Source: istio/templates/configmap.yaml apiVersion: v1 kind: ConfigMap metadata: name: istio namespace: istio-system labels: app: istio chart: istio-0.8.0 release: RELEASE-NAME heritage: Tiller data: mesh: |- # # Edit this list to avoid using mTLS to connect to these services. # Typically, these are control services (e.g kubernetes API server) that don't have istio sidecar # to transparently terminate mTLS authentication. # mtlsExcludedServices: ["kubernetes.default.svc.cluster.local"] # Set the following variable to true to disable policy checks by the Mixer. # Note that metrics will still be reported to the Mixer. disablePolicyChecks: false # Set enableTracing to false to disable request tracing. enableTracing: true # # To disable the mixer completely (including metrics), comment out # the following lines mixerCheckServer: istio-policy.istio-system.svc.cluster.local:15004 mixerReportServer: istio-telemetry.istio-system.svc.cluster.local:15004 # This is the ingress service name, update if you used a different name ingressService: istio-ingress # # Along with discoveryRefreshDelay, this setting determines how # frequently should Envoy fetch and update its internal configuration # from istio Pilot. Lower refresh delay results in higher CPU # utilization and potential performance loss in exchange for faster # convergence. Tweak this value according to your setup. rdsRefreshDelay: 10s # defaultConfig: # NOTE: If you change any values in this section, make sure to make # the same changes in start up args in istio-ingress pods. # See rdsRefreshDelay for explanation about this setting. discoveryRefreshDelay: 10s # # TCP connection timeout between Envoy & the application, and between Envoys. connectTimeout: 10s # ### ADVANCED SETTINGS ############# # Where should envoy's configuration be stored in the istio-proxy container configPath: "/etc/istio/proxy" binaryPath: "/usr/local/bin/envoy" # The pseudo service name used for Envoy. serviceCluster: istio-proxy # These settings that determine how long an old Envoy # process should be kept alive after an occasional reload. drainDuration: 45s parentShutdownDuration: 1m0s # # The mode used to redirect inbound connections to Envoy. 
This setting # has no effect on outbound traffic: iptables REDIRECT is always used for # outbound connections. # If "REDIRECT", use iptables REDIRECT to NAT and redirect to Envoy. # The "REDIRECT" mode loses source addresses during redirection. # If "TPROXY", use iptables TPROXY to redirect to Envoy. # The "TPROXY" mode preserves both the source and destination IP # addresses and ports, so that they can be used for advanced filtering # and manipulation. # The "TPROXY" mode also configures the sidecar to run with the # CAP_NET_ADMIN capability, which is required to use TPROXY. #interceptionMode: REDIRECT # # Port where Envoy listens (on local host) for admin commands # You can exec into the istio-proxy container in a pod and # curl the admin port (curl http://localhost:15000/) to obtain # diagnostic information from Envoy. See # https://lyft.github.io/envoy/docs/operations/admin.html # for more details proxyAdminPort: 15000 # # Zipkin trace collector zipkinAddress: zipkin.istio-system:9411 # # Statsd metrics collector converts statsd metrics into Prometheus metrics. statsdUdpAddress: istio-statsd-prom-bridge.istio-system:9125 # # Mutual TLS authentication between sidecars and istio control plane. controlPlaneAuthPolicy: NONE # # Address where istio Pilot service is running discoveryAddress: istio-pilot.istio-system:15007 --- # Source: istio/templates/sidecar-injector-configmap.yaml apiVersion: v1 kind: ConfigMap metadata: name: istio-sidecar-injector namespace: istio-system labels: app: istio chart: istio-0.8.0 release: RELEASE-NAME heritage: Tiller istio: sidecar-injector data: config: |- policy: enabled template: |- initContainers: - name: istio-init image: docker.io/istio/proxy_init:0.8.0 args: - "-p" - [[ .MeshConfig.ProxyListenPort ]] - "-u" - 1337 - "-m" - [[ or (index .ObjectMeta.Annotations "sidecar.istio.io/interceptionMode") .ProxyConfig.InterceptionMode.String ]] - "-i" [[ if (isset .ObjectMeta.Annotations "traffic.sidecar.istio.io/includeOutboundIPRanges") -]] - "[[ index .ObjectMeta.Annotations "traffic.sidecar.istio.io/includeOutboundIPRanges" ]]" [[ else -]] - "*" [[ end -]] - "-x" [[ if (isset .ObjectMeta.Annotations "traffic.sidecar.istio.io/excludeOutboundIPRanges") -]] - "[[ index .ObjectMeta.Annotations "traffic.sidecar.istio.io/excludeOutboundIPRanges" ]]" [[ else -]] - "" [[ end -]] - "-b" [[ if (isset .ObjectMeta.Annotations "traffic.sidecar.istio.io/includeInboundPorts") -]] - "[[ index .ObjectMeta.Annotations "traffic.sidecar.istio.io/includeInboundPorts" ]]" [[ else -]] - [[ range .Spec.Containers -]][[ range .Ports -]][[ .ContainerPort -]], [[ end -]][[ end -]][[ end]] - "-d" [[ if (isset .ObjectMeta.Annotations "traffic.sidecar.istio.io/excludeInboundPorts") -]] - "[[ index .ObjectMeta.Annotations "traffic.sidecar.istio.io/excludeInboundPorts" ]]" [[ else -]] - "" [[ end -]] imagePullPolicy: IfNotPresent securityContext: capabilities: add: - NET_ADMIN privileged: true restartPolicy: Always containers: - name: istio-proxy image: [[ if (isset .ObjectMeta.Annotations "sidecar.istio.io/proxyImage") -]] "[[ index .ObjectMeta.Annotations "sidecar.istio.io/proxyImage" ]]" [[ else -]] docker.io/istio/proxyv2:0.8.0 [[ end -]] args: - proxy - sidecar - --configPath - [[ .ProxyConfig.ConfigPath ]] - --binaryPath - [[ .ProxyConfig.BinaryPath ]] - --serviceCluster [[ if ne "" (index .ObjectMeta.Labels "app") -]] - [[ index .ObjectMeta.Labels "app" ]] [[ else -]] - "istio-proxy" [[ end -]] - --drainDuration - [[ formatDuration .ProxyConfig.DrainDuration ]] - 
--parentShutdownDuration - [[ formatDuration .ProxyConfig.ParentShutdownDuration ]] - --discoveryAddress - [[ .ProxyConfig.DiscoveryAddress ]] - --discoveryRefreshDelay - [[ formatDuration .ProxyConfig.DiscoveryRefreshDelay ]] - --zipkinAddress - [[ .ProxyConfig.ZipkinAddress ]] - --connectTimeout - [[ formatDuration .ProxyConfig.ConnectTimeout ]] - --statsdUdpAddress - [[ .ProxyConfig.StatsdUdpAddress ]] - --proxyAdminPort - [[ .ProxyConfig.ProxyAdminPort ]] - --controlPlaneAuthPolicy - [[ .ProxyConfig.ControlPlaneAuthPolicy ]] env: - name: POD_NAME valueFrom: fieldRef: fieldPath: metadata.name - name: POD_NAMESPACE valueFrom: fieldRef: fieldPath: metadata.namespace - name: INSTANCE_IP valueFrom: fieldRef: fieldPath: status.podIP - name: ISTIO_META_POD_NAME valueFrom: fieldRef: fieldPath: metadata.name - name: ISTIO_META_INTERCEPTION_MODE value: [[ or (index .ObjectMeta.Annotations "sidecar.istio.io/interceptionMode") .ProxyConfig.InterceptionMode.String ]] imagePullPolicy: IfNotPresent securityContext: privileged: false readOnlyRootFilesystem: true [[ if eq (or (index .ObjectMeta.Annotations "sidecar.istio.io/interceptionMode") .ProxyConfig.InterceptionMode.String) "TPROXY" -]] capabilities: add: - NET_ADMIN [[ else -]] runAsUser: 1337 [[ end -]] restartPolicy: Always resources: requests: cpu: 100m memory: 128Mi volumeMounts: - mountPath: /etc/istio/proxy name: istio-envoy - mountPath: /etc/certs/ name: istio-certs readOnly: true volumes: - emptyDir: medium: Memory name: istio-envoy - name: istio-certs secret: optional: true [[ if eq .Spec.ServiceAccountName "" -]] secretName: istio.default [[ else -]] secretName: [[ printf "istio.%s" .Spec.ServiceAccountName ]] [[ end -]] --- # Source: istio/charts/egressgateway/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: name: istio-egressgateway-service-account namespace: istio-system labels: app: egressgateway chart: egressgateway-0.8.0 heritage: Tiller release: RELEASE-NAME --- # Source: istio/charts/ingressgateway/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: name: istio-ingressgateway-service-account namespace: istio-system labels: app: ingressgateway chart: ingressgateway-0.8.0 heritage: Tiller release: RELEASE-NAME --- # Source: istio/charts/mixer/templates/create-custom-resources-job.yaml apiVersion: v1 kind: ServiceAccount metadata: name: istio-mixer-post-install-account namespace: istio-system labels: app: mixer chart: mixer-0.8.0 heritage: Tiller release: RELEASE-NAME --- apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRole metadata: name: istio-mixer-post-install-istio-system namespace: istio-system labels: app: mixer chart: mixer-0.8.0 heritage: Tiller release: RELEASE-NAME rules: - apiGroups: ["config.istio.io"] # istio CRD watcher resources: ["*"] verbs: ["create", "get", "list", "watch", "patch"] - apiGroups: ["networking.istio.io"] # needed to create mixer destination rules resources: ["*"] verbs: ["*"] - apiGroups: ["apiextensions.k8s.io"] resources: ["customresourcedefinitions"] verbs: ["get", "list", "watch"] - apiGroups: [""] resources: ["configmaps", "endpoints", "pods", "services", "namespaces", "secrets"] verbs: ["get", "list", "watch"] --- apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRoleBinding metadata: name: istio-mixer-post-install-role-binding-istio-system labels: app: mixer chart: mixer-0.8.0 heritage: Tiller release: RELEASE-NAME roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: istio-mixer-post-install-istio-system 
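# (For reference: the sidecar-injector template in the ConfigMap above is a Go
# template rendered per pod -- the [[ ]] delimiters are left intact by Helm --
# and per-pod annotations override its defaults. A minimal, hypothetical
# workload pod could opt out of injection or tune the init container like so;
# the IP range and image values below are illustrative only:
#
#   metadata:
#     annotations:
#       sidecar.istio.io/inject: "false"
#       sidecar.istio.io/proxyImage: docker.io/istio/proxyv2:0.8.0
#       traffic.sidecar.istio.io/includeOutboundIPRanges: "10.0.0.0/8"
# )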
subjects: - kind: ServiceAccount name: istio-mixer-post-install-account namespace: istio-system --- apiVersion: batch/v1 kind: Job metadata: name: istio-mixer-post-install namespace: istio-system annotations: "helm.sh/hook": post-install "helm.sh/hook-delete-policy": before-hook-creation labels: app: mixer chart: mixer-0.8.0 release: RELEASE-NAME heritage: Tiller spec: template: metadata: name: istio-mixer-post-install labels: app: mixer release: RELEASE-NAME spec: serviceAccountName: istio-mixer-post-install-account containers: - name: hyperkube image: "quay.io/coreos/hyperkube:v1.7.6_coreos.0" command: - ./kubectl - apply - -f - /tmp/mixer/custom-resources.yaml volumeMounts: - mountPath: "/tmp/mixer" name: tmp-configmap-mixer volumes: - name: tmp-configmap-mixer configMap: name: istio-mixer-custom-resources restartPolicy: Never # CRD might take some time till they are available to consume --- # Source: istio/charts/mixer/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: name: istio-mixer-service-account namespace: istio-system labels: app: mixer chart: mixer-0.8.0 heritage: Tiller release: RELEASE-NAME --- # Source: istio/charts/pilot/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: name: istio-pilot-service-account namespace: istio-system labels: app: istio-pilot chart: pilot-0.8.0 heritage: Tiller release: RELEASE-NAME --- # Source: istio/charts/prometheus/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: name: prometheus namespace: istio-system --- # Source: istio/charts/security/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: name: istio-citadel-service-account namespace: istio-system labels: app: security chart: security-0.8.0 heritage: Tiller release: RELEASE-NAME --- apiVersion: v1 kind: ServiceAccount metadata: name: istio-cleanup-old-ca-service-account namespace: istio-system labels: app: security chart: security-0.8.0 heritage: Tiller release: RELEASE-NAME --- # Source: istio/charts/sidecarInjectorWebhook/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: name: istio-sidecar-injector-service-account namespace: istio-system labels: app: istio-sidecar-injector chart: sidecarInjectorWebhook-0.8.0 heritage: Tiller release: RELEASE-NAME --- # Source: istio/charts/mixer/templates/crds.yaml # Mixer CRDs kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: rules.config.istio.io labels: app: mixer package: istio.io.mixer istio: core spec: group: config.istio.io names: kind: rule plural: rules singular: rule scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: attributemanifests.config.istio.io labels: app: mixer package: istio.io.mixer istio: core spec: group: config.istio.io names: kind: attributemanifest plural: attributemanifests singular: attributemanifest scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: circonuses.config.istio.io labels: app: mixer package: circonus istio: mixer-adapter spec: group: config.istio.io names: kind: circonus plural: circonuses singular: circonus scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: deniers.config.istio.io labels: app: mixer package: denier istio: mixer-adapter spec: group: config.istio.io names: kind: denier plural: deniers singular: denier scope: 
Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: fluentds.config.istio.io labels: app: mixer package: fluentd istio: mixer-adapter spec: group: config.istio.io names: kind: fluentd plural: fluentds singular: fluentd scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: kubernetesenvs.config.istio.io labels: app: mixer package: kubernetesenv istio: mixer-adapter spec: group: config.istio.io names: kind: kubernetesenv plural: kubernetesenvs singular: kubernetesenv scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: listcheckers.config.istio.io labels: app: mixer package: listchecker istio: mixer-adapter spec: group: config.istio.io names: kind: listchecker plural: listcheckers singular: listchecker scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: memquotas.config.istio.io labels: app: mixer package: memquota istio: mixer-adapter spec: group: config.istio.io names: kind: memquota plural: memquotas singular: memquota scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: noops.config.istio.io labels: app: mixer package: noop istio: mixer-adapter spec: group: config.istio.io names: kind: noop plural: noops singular: noop scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: opas.config.istio.io labels: app: mixer package: opa istio: mixer-adapter spec: group: config.istio.io names: kind: opa plural: opas singular: opa scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: prometheuses.config.istio.io labels: app: mixer package: prometheus istio: mixer-adapter spec: group: config.istio.io names: kind: prometheus plural: prometheuses singular: prometheus scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: rbacs.config.istio.io labels: app: mixer package: rbac istio: mixer-adapter spec: group: config.istio.io names: kind: rbac plural: rbacs singular: rbac scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: servicecontrols.config.istio.io labels: app: mixer package: servicecontrol istio: mixer-adapter spec: group: config.istio.io names: kind: servicecontrol plural: servicecontrols singular: servicecontrol scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: solarwindses.config.istio.io labels: app: mixer package: solarwinds istio: mixer-adapter spec: group: config.istio.io names: kind: solarwinds plural: solarwindses singular: solarwinds scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: stackdrivers.config.istio.io labels: app: mixer package: stackdriver istio: mixer-adapter spec: group: config.istio.io names: kind: stackdriver plural: stackdrivers singular: stackdriver scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: statsds.config.istio.io labels: app: mixer package: statsd istio: mixer-adapter spec: group: 
config.istio.io names: kind: statsd plural: statsds singular: statsd scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: stdios.config.istio.io labels: app: mixer package: stdio istio: mixer-adapter spec: group: config.istio.io names: kind: stdio plural: stdios singular: stdio scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: apikeys.config.istio.io labels: app: mixer package: apikey istio: mixer-instance spec: group: config.istio.io names: kind: apikey plural: apikeys singular: apikey scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: authorizations.config.istio.io labels: app: mixer package: authorization istio: mixer-instance spec: group: config.istio.io names: kind: authorization plural: authorizations singular: authorization scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: checknothings.config.istio.io labels: app: mixer package: checknothing istio: mixer-instance spec: group: config.istio.io names: kind: checknothing plural: checknothings singular: checknothing scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: kuberneteses.config.istio.io labels: app: mixer package: adapter.template.kubernetes istio: mixer-instance spec: group: config.istio.io names: kind: kubernetes plural: kuberneteses singular: kubernetes scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: listentries.config.istio.io labels: app: mixer package: listentry istio: mixer-instance spec: group: config.istio.io names: kind: listentry plural: listentries singular: listentry scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: logentries.config.istio.io labels: app: mixer package: logentry istio: mixer-instance spec: group: config.istio.io names: kind: logentry plural: logentries singular: logentry scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: metrics.config.istio.io labels: app: mixer package: metric istio: mixer-instance spec: group: config.istio.io names: kind: metric plural: metrics singular: metric scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: quotas.config.istio.io labels: app: mixer package: quota istio: mixer-instance spec: group: config.istio.io names: kind: quota plural: quotas singular: quota scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: reportnothings.config.istio.io labels: app: mixer package: reportnothing istio: mixer-instance spec: group: config.istio.io names: kind: reportnothing plural: reportnothings singular: reportnothing scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: servicecontrolreports.config.istio.io labels: app: mixer package: servicecontrolreport istio: mixer-instance spec: group: config.istio.io names: kind: servicecontrolreport plural: servicecontrolreports singular: servicecontrolreport scope: Namespaced version: v1alpha2 --- kind: 
CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: tracespans.config.istio.io labels: app: mixer package: tracespan istio: mixer-instance spec: group: config.istio.io names: kind: tracespan plural: tracespans singular: tracespan scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: serviceroles.config.istio.io labels: app: mixer package: istio.io.mixer istio: rbac spec: group: config.istio.io names: kind: ServiceRole plural: serviceroles singular: servicerole scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: servicerolebindings.config.istio.io labels: app: mixer package: istio.io.mixer istio: rbac spec: group: config.istio.io names: kind: ServiceRoleBinding plural: servicerolebindings singular: servicerolebinding scope: Namespaced version: v1alpha2 --- # Source: istio/charts/pilot/templates/crds.yaml apiVersion: apiextensions.k8s.io/v1beta1 kind: CustomResourceDefinition metadata: name: destinationpolicies.config.istio.io labels: app: istio-pilot spec: group: config.istio.io names: kind: DestinationPolicy listKind: DestinationPolicyList plural: destinationpolicies singular: destinationpolicy scope: Namespaced version: v1alpha2 --- apiVersion: apiextensions.k8s.io/v1beta1 kind: CustomResourceDefinition metadata: name: egressrules.config.istio.io labels: app: istio-pilot spec: group: config.istio.io names: kind: EgressRule listKind: EgressRuleList plural: egressrules singular: egressrule scope: Namespaced version: v1alpha2 --- apiVersion: apiextensions.k8s.io/v1beta1 kind: CustomResourceDefinition metadata: name: routerules.config.istio.io labels: app: istio-pilot spec: group: config.istio.io names: kind: RouteRule listKind: RouteRuleList plural: routerules singular: routerule scope: Namespaced version: v1alpha2 --- apiVersion: apiextensions.k8s.io/v1beta1 kind: CustomResourceDefinition metadata: name: virtualservices.networking.istio.io labels: app: istio-pilot spec: group: networking.istio.io names: kind: VirtualService listKind: VirtualServiceList plural: virtualservices singular: virtualservice scope: Namespaced version: v1alpha3 --- apiVersion: apiextensions.k8s.io/v1beta1 kind: CustomResourceDefinition metadata: name: destinationrules.networking.istio.io labels: app: istio-pilot spec: group: networking.istio.io names: kind: DestinationRule listKind: DestinationRuleList plural: destinationrules singular: destinationrule scope: Namespaced version: v1alpha3 --- apiVersion: apiextensions.k8s.io/v1beta1 kind: CustomResourceDefinition metadata: name: serviceentries.networking.istio.io labels: app: istio-pilot spec: group: networking.istio.io names: kind: ServiceEntry listKind: ServiceEntryList plural: serviceentries singular: serviceentry scope: Namespaced version: v1alpha3 --- apiVersion: apiextensions.k8s.io/v1beta1 kind: CustomResourceDefinition metadata: name: gateways.networking.istio.io labels: app: istio-pilot spec: group: networking.istio.io names: kind: Gateway plural: gateways singular: gateway scope: Namespaced version: v1alpha3 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: policies.authentication.istio.io spec: group: authentication.istio.io names: kind: Policy plural: policies singular: policy scope: Namespaced version: v1alpha1 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: httpapispecbindings.config.istio.io 
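# (For reference: the Mixer CRDs in this file are consumed as namespaced custom
# resources. A minimal, hypothetical adapter/instance/rule triple using the
# denier, checknothing, and rule kinds registered above -- the names and the
# match expression are illustrative only:
#
#   apiVersion: config.istio.io/v1alpha2
#   kind: denier
#   metadata:
#     name: denyall
#   spec:
#     status:
#       code: 7
#       message: Not allowed
#   ---
#   apiVersion: config.istio.io/v1alpha2
#   kind: checknothing
#   metadata:
#     name: denyrequest
#   spec:
#   ---
#   apiVersion: config.istio.io/v1alpha2
#   kind: rule
#   metadata:
#     name: denyv3
#   spec:
#     match: destination.labels["version"] == "v3"
#     actions:
#     - handler: denyall.denier
#       instances:
#       - denyrequest.checknothing
# )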
spec: group: config.istio.io names: kind: HTTPAPISpecBinding plural: httpapispecbindings singular: httpapispecbinding scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: httpapispecs.config.istio.io spec: group: config.istio.io names: kind: HTTPAPISpec plural: httpapispecs singular: httpapispec scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: quotaspecbindings.config.istio.io spec: group: config.istio.io names: kind: QuotaSpecBinding plural: quotaspecbindings singular: quotaspecbinding scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: quotaspecs.config.istio.io spec: group: config.istio.io names: kind: QuotaSpec plural: quotaspecs singular: quotaspec scope: Namespaced version: v1alpha2 --- # Source: istio/charts/mixer/templates/clusterrole.yaml apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRole metadata: name: istio-mixer-istio-system namespace: istio-system labels: app: mixer chart: mixer-0.8.0 heritage: Tiller release: RELEASE-NAME rules: - apiGroups: ["config.istio.io"] # istio CRD watcher resources: ["*"] verbs: ["create", "get", "list", "watch", "patch"] - apiGroups: ["apiextensions.k8s.io"] resources: ["customresourcedefinitions"] verbs: ["get", "list", "watch"] - apiGroups: [""] resources: ["configmaps", "endpoints", "pods", "services", "namespaces", "secrets"] verbs: ["get", "list", "watch"] --- # Source: istio/charts/pilot/templates/clusterrole.yaml apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRole metadata: name: istio-pilot-istio-system namespace: istio-system labels: app: istio-pilot chart: pilot-0.8.0 heritage: Tiller release: RELEASE-NAME rules: - apiGroups: ["config.istio.io"] resources: ["*"] verbs: ["*"] - apiGroups: ["networking.istio.io"] resources: ["*"] verbs: ["*"] - apiGroups: ["authentication.istio.io"] resources: ["*"] verbs: ["*"] - apiGroups: ["apiextensions.k8s.io"] resources: ["customresourcedefinitions"] verbs: ["*"] - apiGroups: ["extensions"] resources: ["thirdpartyresources", "thirdpartyresources.extensions", "ingresses", "ingresses/status"] verbs: ["*"] - apiGroups: [""] resources: ["configmaps"] verbs: ["create", "get", "list", "watch", "update"] - apiGroups: [""] resources: ["endpoints", "pods", "services"] verbs: ["get", "list", "watch"] - apiGroups: [""] resources: ["namespaces", "nodes", "secrets"] verbs: ["get", "list", "watch"] --- # Source: istio/charts/prometheus/templates/clusterrole.yaml --- apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRole metadata: name: prometheus-istio-system namespace: istio-system rules: - apiGroups: [""] resources: - nodes - services - endpoints - pods - nodes/proxy verbs: ["get", "list", "watch"] - apiGroups: [""] resources: - configmaps verbs: ["get"] - nonResourceURLs: ["/metrics"] verbs: ["get"] --- apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRoleBinding metadata: name: prometheus-istio-system namespace: istio-system roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: prometheus-istio-system subjects: - kind: ServiceAccount name: prometheus namespace: istio-system --- --- # Source: istio/charts/security/templates/clusterrole.yaml apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRole metadata: name: istio-citadel-istio-system namespace: istio-system labels: app: security chart: security-0.8.0 heritage: Tiller 
release: RELEASE-NAME rules: - apiGroups: [""] resources: ["secrets"] verbs: ["create", "get", "watch", "list", "update", "delete"] - apiGroups: [""] resources: ["serviceaccounts"] verbs: ["get", "watch", "list"] - apiGroups: [""] resources: ["services"] verbs: ["get", "watch", "list"] --- apiVersion: rbac.authorization.k8s.io/v1beta1 kind: Role metadata: name: istio-cleanup-old-ca-istio-system namespace: istio-system labels: app: security chart: security-0.8.0 heritage: Tiller release: RELEASE-NAME rules: - apiGroups: [""] resources: ["deployments", "serviceaccounts", "services"] verbs: ["get", "delete"] - apiGroups: ["extensions"] resources: ["deployments", "replicasets"] verbs: ["get", "list", "update", "delete"] --- # Source: istio/charts/sidecarInjectorWebhook/templates/clusterrole.yaml apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRole metadata: name: istio-sidecar-injector-istio-system labels: app: istio-sidecar-injector chart: sidecarInjectorWebhook-0.8.0 heritage: Tiller release: RELEASE-NAME rules: - apiGroups: ["*"] resources: ["configmaps"] verbs: ["get", "list", "watch"] - apiGroups: ["admissionregistration.k8s.io"] resources: ["mutatingwebhookconfigurations"] verbs: ["get", "list", "watch", "patch"] --- # Source: istio/charts/mixer/templates/clusterrolebinding.yaml apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRoleBinding metadata: name: istio-mixer-admin-role-binding-istio-system labels: app: mixer chart: mixer-0.8.0 heritage: Tiller release: RELEASE-NAME roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: istio-mixer-istio-system subjects: - kind: ServiceAccount name: istio-mixer-service-account namespace: istio-system --- # Source: istio/charts/pilot/templates/clusterrolebinding.yaml apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRoleBinding metadata: name: istio-pilot-istio-system labels: app: istio-pilot chart: pilot-0.8.0 heritage: Tiller release: RELEASE-NAME roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: istio-pilot-istio-system subjects: - kind: ServiceAccount name: istio-pilot-service-account namespace: istio-system --- # Source: istio/charts/security/templates/clusterrolebinding.yaml apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRoleBinding metadata: name: istio-citadel-istio-system labels: app: security chart: security-0.8.0 heritage: Tiller release: RELEASE-NAME roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: istio-citadel-istio-system subjects: - kind: ServiceAccount name: istio-citadel-service-account namespace: istio-system --- apiVersion: rbac.authorization.k8s.io/v1beta1 kind: RoleBinding metadata: name: istio-cleanup-old-ca-istio-system namespace: istio-system labels: app: security chart: security-0.8.0 heritage: Tiller release: RELEASE-NAME roleRef: apiGroup: rbac.authorization.k8s.io kind: Role name: istio-cleanup-old-ca-istio-system subjects: - kind: ServiceAccount name: istio-cleanup-old-ca-service-account namespace: istio-system --- # Source: istio/charts/sidecarInjectorWebhook/templates/clusterrolebinding.yaml apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRoleBinding metadata: name: istio-sidecar-injector-admin-role-binding-istio-system labels: app: istio-sidecar-injector chart: sidecarInjectorWebhook-0.8.0 heritage: Tiller release: RELEASE-NAME roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: istio-sidecar-injector-istio-system subjects: - kind: ServiceAccount name: istio-sidecar-injector-service-account 
namespace: istio-system --- # Source: istio/charts/egressgateway/templates/service.yaml apiVersion: v1 kind: Service metadata: name: istio-egressgateway namespace: istio-system labels: chart: egressgateway-0.8.0 release: RELEASE-NAME heritage: Tiller istio: egressgateway spec: type: ClusterIP selector: istio: egressgateway ports: - name: http port: 80 - name: https port: 443 --- # Source: istio/charts/grafana/templates/service.yaml apiVersion: v1 kind: Service metadata: name: grafana namespace: istio-system annotations: auth.istio.io/3000: NONE labels: app: grafana chart: grafana-0.1.0 release: RELEASE-NAME heritage: Tiller spec: type: ClusterIP ports: - port: 3000 targetPort: 3000 protocol: TCP name: http selector: app: grafana --- # Source: istio/charts/ingressgateway/templates/service.yaml apiVersion: v1 kind: Service metadata: name: istio-ingressgateway namespace: istio-system labels: chart: ingressgateway-0.8.0 release: RELEASE-NAME heritage: Tiller istio: ingressgateway spec: type: NodePort selector: istio: ingressgateway ports: - name: http nodePort: 31380 port: 80 - name: https nodePort: 31390 port: 443 - name: tcp nodePort: 31400 port: 31400 --- # Source: istio/charts/mixer/templates/service.yaml apiVersion: v1 kind: Service metadata: name: istio-policy namespace: istio-system labels: chart: mixer-0.8.0 release: RELEASE-NAME istio: mixer spec: ports: - name: grpc-mixer port: 9091 - name: grpc-mixer-mtls port: 15004 - name: http-monitoring port: 9093 selector: istio: mixer istio-mixer-type: policy --- apiVersion: v1 kind: Service metadata: name: istio-telemetry namespace: istio-system labels: chart: mixer-0.8.0 release: RELEASE-NAME istio: mixer spec: ports: - name: grpc-mixer port: 9091 - name: grpc-mixer-mtls port: 15004 - name: http-monitoring port: 9093 - name: prometheus port: 42422 selector: istio: mixer istio-mixer-type: telemetry --- --- # Source: istio/charts/mixer/templates/statsdtoprom.yaml --- apiVersion: v1 kind: Service metadata: name: istio-statsd-prom-bridge namespace: istio-system labels: chart: mixer-0.8.0 release: RELEASE-NAME istio: statsd-prom-bridge spec: ports: - name: statsd-prom port: 9102 - name: statsd-udp port: 9125 protocol: UDP selector: istio: statsd-prom-bridge --- apiVersion: extensions/v1beta1 kind: Deployment metadata: name: istio-statsd-prom-bridge namespace: istio-system labels: chart: mixer-0.8.0 release: RELEASE-NAME istio: mixer spec: template: metadata: labels: istio: statsd-prom-bridge annotations: sidecar.istio.io/inject: "false" spec: serviceAccountName: istio-mixer-service-account volumes: - name: config-volume configMap: name: istio-statsd-prom-bridge containers: - name: statsd-prom-bridge image: "prom/statsd-exporter:latest" imagePullPolicy: IfNotPresent ports: - containerPort: 9102 - containerPort: 9125 protocol: UDP args: - '-statsd.mapping-config=/etc/statsd/mapping.conf' resources: {} volumeMounts: - name: config-volume mountPath: /etc/statsd --- # Source: istio/charts/pilot/templates/service.yaml apiVersion: v1 kind: Service metadata: name: istio-pilot namespace: istio-system labels: app: istio-pilot chart: pilot-0.8.0 release: RELEASE-NAME heritage: Tiller spec: ports: - port: 15003 name: http-old-discovery # mTLS or non-mTLS depending on auth setting - port: 15005 name: https-discovery # always mTLS - port: 15007 name: http-discovery # always plain-text - port: 15010 name: grpc-xds # direct - port: 15011 name: https-xds # mTLS - port: 8080 name: http-legacy-discovery # direct - port: 9093 name: http-monitoring selector: istio: 
pilot --- # Source: istio/charts/prometheus/templates/service.yaml apiVersion: v1 kind: Service metadata: name: prometheus namespace: istio-system annotations: prometheus.io/scrape: 'true' labels: name: prometheus spec: selector: app: prometheus ports: - name: http-prometheus protocol: TCP port: 9090 --- # Source: istio/charts/security/templates/service.yaml apiVersion: v1 kind: Service metadata: # we use the normal name here (e.g. 'prometheus') # as grafana is configured to use this as a data source name: istio-citadel namespace: istio-system labels: app: istio-citadel spec: ports: - name: grpc-citadel port: 8060 targetPort: 8060 protocol: TCP - name: http-monitoring port: 9093 selector: istio: citadel --- # Source: istio/charts/servicegraph/templates/service.yaml apiVersion: v1 kind: Service metadata: name: servicegraph namespace: istio-system labels: app: servicegraph chart: servicegraph-0.1.0 release: RELEASE-NAME heritage: Tiller spec: type: ClusterIP ports: - port: 8088 targetPort: 8088 protocol: TCP name: http selector: app: servicegraph --- # Source: istio/charts/sidecarInjectorWebhook/templates/service.yaml apiVersion: v1 kind: Service metadata: name: istio-sidecar-injector namespace: istio-system labels: istio: sidecar-injector spec: ports: - port: 443 selector: istio: sidecar-injector --- # Source: istio/charts/egressgateway/templates/deployment.yaml apiVersion: extensions/v1beta1 kind: Deployment metadata: name: istio-egressgateway namespace: istio-system labels: app: egressgateway chart: egressgateway-0.8.0 release: RELEASE-NAME heritage: Tiller istio: egressgateway spec: replicas: template: metadata: labels: istio: egressgateway annotations: sidecar.istio.io/inject: "false" spec: serviceAccountName: istio-egressgateway-service-account containers: - name: egressgateway image: "docker.io/istio/proxyv2:0.8.0" imagePullPolicy: IfNotPresent ports: - containerPort: 80 - containerPort: 443 args: - proxy - router - -v - "2" - --discoveryRefreshDelay - '1s' #discoveryRefreshDelay - --drainDuration - '45s' #drainDuration - --parentShutdownDuration - '1m0s' #parentShutdownDuration - --connectTimeout - '10s' #connectTimeout - --serviceCluster - istio-egressgateway - --zipkinAddress - zipkin:9411 - --statsdUdpAddress - istio-statsd-prom-bridge:9125 - --proxyAdminPort - "15000" - --controlPlaneAuthPolicy - NONE - --discoveryAddress - istio-pilot:8080 resources: {} env: - name: POD_NAME valueFrom: fieldRef: apiVersion: v1 fieldPath: metadata.name - name: POD_NAMESPACE valueFrom: fieldRef: apiVersion: v1 fieldPath: metadata.namespace - name: INSTANCE_IP valueFrom: fieldRef: fieldPath: status.podIP - name: ISTIO_META_POD_NAME valueFrom: fieldRef: fieldPath: metadata.name volumeMounts: - name: istio-certs mountPath: /etc/certs readOnly: true volumes: - name: istio-certs secret: secretName: "istio.default" optional: true affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - amd64 - ppc64le - s390x preferredDuringSchedulingIgnoredDuringExecution: - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - amd64 - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - ppc64le - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - s390x --- # Source: istio/charts/grafana/templates/deployment.yaml apiVersion: extensions/v1beta1 kind: Deployment metadata: name: grafana 
namespace: istio-system labels: app: grafana chart: grafana-0.1.0 release: RELEASE-NAME heritage: Tiller spec: replicas: 1 template: metadata: labels: app: grafana annotations: sidecar.istio.io/inject: "false" spec: containers: - name: grafana image: "docker.io/istio/grafana:0.8.0" imagePullPolicy: IfNotPresent ports: - containerPort: 3000 readinessProbe: httpGet: path: /login port: 3000 env: - name: GRAFANA_PORT value: "3000" - name: GF_AUTH_BASIC_ENABLED value: "false" - name: GF_AUTH_ANONYMOUS_ENABLED value: "true" - name: GF_AUTH_ANONYMOUS_ORG_ROLE value: Admin - name: GF_PATHS_DATA value: /data/grafana resources: {} volumeMounts: - name: data mountPath: /data/grafana affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - amd64 - ppc64le - s390x preferredDuringSchedulingIgnoredDuringExecution: - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - amd64 - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - ppc64le - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - s390x volumes: - name: data emptyDir: {} --- # Source: istio/charts/ingressgateway/templates/deployment.yaml apiVersion: extensions/v1beta1 kind: Deployment metadata: name: istio-ingressgateway namespace: istio-system labels: app: ingressgateway chart: ingressgateway-0.8.0 release: RELEASE-NAME heritage: Tiller istio: ingressgateway spec: replicas: template: metadata: labels: istio: ingressgateway annotations: sidecar.istio.io/inject: "false" spec: serviceAccountName: istio-ingressgateway-service-account containers: - name: ingressgateway image: "docker.io/istio/proxyv2:0.8.0" imagePullPolicy: IfNotPresent ports: - containerPort: 80 - containerPort: 443 - containerPort: 31400 args: - proxy - router - -v - "2" - --discoveryRefreshDelay - '1s' #discoveryRefreshDelay - --drainDuration - '45s' #drainDuration - --parentShutdownDuration - '1m0s' #parentShutdownDuration - --connectTimeout - '10s' #connectTimeout - --serviceCluster - istio-ingressgateway - --zipkinAddress - zipkin:9411 - --statsdUdpAddress - istio-statsd-prom-bridge:9125 - --proxyAdminPort - "15000" - --controlPlaneAuthPolicy - NONE - --discoveryAddress - istio-pilot:8080 resources: {} env: - name: POD_NAME valueFrom: fieldRef: apiVersion: v1 fieldPath: metadata.name - name: POD_NAMESPACE valueFrom: fieldRef: apiVersion: v1 fieldPath: metadata.namespace - name: INSTANCE_IP valueFrom: fieldRef: apiVersion: v1 fieldPath: status.podIP - name: ISTIO_META_POD_NAME valueFrom: fieldRef: fieldPath: metadata.name volumeMounts: - name: istio-certs mountPath: /etc/certs readOnly: true - name: ingressgateway-certs mountPath: "/etc/istio/ingressgateway-certs" readOnly: true volumes: - name: istio-certs secret: secretName: "istio.default" optional: true - name: ingressgateway-certs secret: secretName: "istio-ingressgateway-certs" optional: true affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - amd64 - ppc64le - s390x preferredDuringSchedulingIgnoredDuringExecution: - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - amd64 - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - ppc64le - weight: 2 preference: matchExpressions: - key: 
beta.kubernetes.io/arch operator: In values: - s390x --- # Source: istio/charts/mixer/templates/deployment.yaml apiVersion: extensions/v1beta1 kind: Deployment metadata: name: istio-policy namespace: istio-system labels: chart: mixer-0.8.0 release: RELEASE-NAME istio: mixer spec: replicas: 1 template: metadata: labels: istio: mixer istio-mixer-type: policy annotations: sidecar.istio.io/inject: "false" spec: serviceAccountName: istio-mixer-service-account volumes: - name: istio-certs secret: secretName: istio.istio-mixer-service-account optional: true affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - amd64 - ppc64le - s390x preferredDuringSchedulingIgnoredDuringExecution: - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - amd64 - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - ppc64le - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - s390x containers: - name: mixer image: "docker.io/istio/mixer:0.8.0" imagePullPolicy: IfNotPresent ports: - containerPort: 9092 - containerPort: 9093 - containerPort: 42422 args: - --address - tcp://127.0.0.1:9092 - --configStoreURL=k8s:// - --configDefaultNamespace=istio-system - --trace_zipkin_url=http://zipkin:9411/api/v1/spans resources: {} - name: istio-proxy image: "docker.io/istio/proxyv2:0.8.0" imagePullPolicy: IfNotPresent ports: - containerPort: 9091 - containerPort: 15004 args: - proxy - --serviceCluster - istio-policy - --templateFile - /etc/istio/proxy/envoy_policy.yaml.tmpl - --controlPlaneAuthPolicy - NONE env: - name: POD_NAME valueFrom: fieldRef: apiVersion: v1 fieldPath: metadata.name - name: POD_NAMESPACE valueFrom: fieldRef: apiVersion: v1 fieldPath: metadata.namespace - name: INSTANCE_IP valueFrom: fieldRef: apiVersion: v1 fieldPath: status.podIP resources: requests: cpu: 100m memory: 128Mi volumeMounts: - name: istio-certs mountPath: /etc/certs readOnly: true --- apiVersion: extensions/v1beta1 kind: Deployment metadata: name: istio-telemetry namespace: istio-system labels: chart: mixer-0.8.0 release: RELEASE-NAME istio: mixer spec: replicas: 1 template: metadata: labels: istio: mixer istio-mixer-type: telemetry annotations: sidecar.istio.io/inject: "false" spec: serviceAccountName: istio-mixer-service-account volumes: - name: istio-certs secret: secretName: istio.istio-mixer-service-account optional: true containers: - name: mixer image: "docker.io/istio/mixer:0.8.0" imagePullPolicy: IfNotPresent ports: - containerPort: 9092 - containerPort: 9093 - containerPort: 42422 args: - --address - tcp://127.0.0.1:9092 - --configStoreURL=k8s:// - --configDefaultNamespace=istio-system - --trace_zipkin_url=http://zipkin:9411/api/v1/spans resources: {} - name: istio-proxy image: "docker.io/istio/proxyv2:0.8.0" imagePullPolicy: IfNotPresent ports: - containerPort: 9091 - containerPort: 15004 args: - proxy - --serviceCluster - istio-telemetry - --templateFile - /etc/istio/proxy/envoy_telemetry.yaml.tmpl - --controlPlaneAuthPolicy - NONE env: - name: POD_NAME valueFrom: fieldRef: apiVersion: v1 fieldPath: metadata.name - name: POD_NAMESPACE valueFrom: fieldRef: apiVersion: v1 fieldPath: metadata.namespace - name: INSTANCE_IP valueFrom: fieldRef: apiVersion: v1 fieldPath: status.podIP resources: requests: cpu: 100m memory: 128Mi volumeMounts: - name: istio-certs mountPath: /etc/certs readOnly: true --- --- # 
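# (For reference: the istio-policy and istio-telemetry Deployments above back
# the Services of the same names defined earlier in this file. Sidecars reach
# Mixer through the istio-proxy container on the mTLS port 15004 -- the
# mixerCheckServer / mixerReportServer addresses in the istio ConfigMap --
# while Prometheus scrapes telemetry on the dedicated "prometheus" port 42422.)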
Source: istio/charts/pilot/templates/deployment.yaml apiVersion: extensions/v1beta1 kind: Deployment metadata: name: istio-pilot namespace: istio-system # TODO: default tempate doesn't have this, which one is right ? labels: app: istio-pilot chart: pilot-0.8.0 release: RELEASE-NAME heritage: Tiller istio: pilot annotations: checksum/config-volume: f8da08b6b8c170dde721efd680270b2901e750d4aa186ebb6c22bef5b78a43f9 spec: replicas: 1 template: metadata: labels: istio: pilot annotations: sidecar.istio.io/inject: "false" spec: serviceAccountName: istio-pilot-service-account containers: - name: discovery image: "docker.io/istio/pilot:0.8.0" imagePullPolicy: IfNotPresent args: - "discovery" # TODO(sdake) remove when secrets are automagically registered ports: - containerPort: 8080 - containerPort: 15010 readinessProbe: httpGet: path: /v1/registration port: 8080 initialDelaySeconds: 30 periodSeconds: 30 timeoutSeconds: 5 env: - name: POD_NAME valueFrom: fieldRef: apiVersion: v1 fieldPath: metadata.name - name: POD_NAMESPACE valueFrom: fieldRef: apiVersion: v1 fieldPath: metadata.namespace - name: PILOT_THROTTLE value: "500" - name: PILOT_CACHE_SQUASH value: "5" resources: {} volumeMounts: - name: config-volume mountPath: /etc/istio/config - name: istio-certs mountPath: /etc/certs readOnly: true - name: istio-proxy image: "docker.io/istio/proxyv2:0.8.0" imagePullPolicy: IfNotPresent ports: - containerPort: 15003 - containerPort: 15005 - containerPort: 15007 - containerPort: 15011 args: - proxy - --serviceCluster - istio-pilot - --templateFile - /etc/istio/proxy/envoy_pilot.yaml.tmpl - --controlPlaneAuthPolicy - NONE env: - name: POD_NAME valueFrom: fieldRef: apiVersion: v1 fieldPath: metadata.name - name: POD_NAMESPACE valueFrom: fieldRef: apiVersion: v1 fieldPath: metadata.namespace - name: INSTANCE_IP valueFrom: fieldRef: apiVersion: v1 fieldPath: status.podIP resources: requests: cpu: 100m memory: 128Mi volumeMounts: - name: istio-certs mountPath: /etc/certs readOnly: true volumes: - name: config-volume configMap: name: istio - name: istio-certs secret: secretName: "istio.istio-pilot-service-account" affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - amd64 - ppc64le - s390x preferredDuringSchedulingIgnoredDuringExecution: - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - amd64 - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - ppc64le - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - s390x --- # Source: istio/charts/prometheus/templates/deployment.yaml # TODO: the original template has service account, roles, etc apiVersion: extensions/v1beta1 kind: Deployment metadata: name: prometheus namespace: istio-system labels: app: prometheus chart: prometheus-0.1.0 release: RELEASE-NAME heritage: Tiller spec: replicas: 1 selector: matchLabels: app: prometheus template: metadata: labels: app: prometheus annotations: sidecar.istio.io/inject: "false" spec: serviceAccountName: prometheus containers: - name: prometheus image: "docker.io/prom/prometheus:latest" imagePullPolicy: IfNotPresent args: - '--storage.tsdb.retention=6h' - '--config.file=/etc/prometheus/prometheus.yml' ports: - containerPort: 9090 name: http livenessProbe: httpGet: path: /-/healthy port: 9090 readinessProbe: httpGet: path: /-/ready port: 9090 resources: {} volumeMounts: - name: config-volume 
mountPath: /etc/prometheus volumes: - name: config-volume configMap: name: prometheus affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - amd64 - ppc64le - s390x preferredDuringSchedulingIgnoredDuringExecution: - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - amd64 - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - ppc64le - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - s390x --- # Source: istio/charts/security/templates/deployment.yaml # istio CA watching all namespaces apiVersion: extensions/v1beta1 kind: Deployment metadata: name: istio-citadel namespace: istio-system labels: app: security chart: security-0.8.0 release: RELEASE-NAME heritage: Tiller istio: citadel spec: replicas: 1 template: metadata: labels: istio: citadel annotations: sidecar.istio.io/inject: "false" spec: serviceAccountName: istio-citadel-service-account containers: - name: citadel image: "docker.io/istio/citadel:0.8.0" imagePullPolicy: IfNotPresent args: - --append-dns-names=true - --grpc-port=8060 - --grpc-hostname=citadel - --self-signed-ca=true - --citadel-storage-namespace=istio-system resources: {} affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - amd64 - ppc64le - s390x preferredDuringSchedulingIgnoredDuringExecution: - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - amd64 - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - ppc64le - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - s390x --- # Source: istio/charts/servicegraph/templates/deployment.yaml apiVersion: extensions/v1beta1 kind: Deployment metadata: name: servicegraph namespace: istio-system labels: app: servicegraph chart: servicegraph-0.1.0 release: RELEASE-NAME heritage: Tiller spec: replicas: 1 template: metadata: labels: app: servicegraph annotations: sidecar.istio.io/inject: "false" spec: containers: - name: servicegraph image: "docker.io/istio/servicegraph:0.8.0" imagePullPolicy: IfNotPresent ports: - containerPort: 8088 args: - --prometheusAddr=http://prometheus:9090 livenessProbe: httpGet: path: /graph port: 8088 readinessProbe: httpGet: path: /graph port: 8088 resources: {} affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - amd64 - ppc64le - s390x preferredDuringSchedulingIgnoredDuringExecution: - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - amd64 - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - ppc64le - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - s390x --- # Source: istio/charts/sidecarInjectorWebhook/templates/deployment.yaml apiVersion: extensions/v1beta1 kind: Deployment metadata: name: istio-sidecar-injector namespace: istio-system labels: app: sidecarInjectorWebhook chart: sidecarInjectorWebhook-0.8.0 release: RELEASE-NAME heritage: Tiller istio: sidecar-injector spec: replicas: template: metadata: labels: istio: sidecar-injector spec: serviceAccountName: 
istio-sidecar-injector-service-account containers: - name: sidecar-injector-webhook image: "docker.io/istio/sidecar_injector:0.8.0" imagePullPolicy: IfNotPresent args: - --caCertFile=/etc/istio/certs/root-cert.pem - --tlsCertFile=/etc/istio/certs/cert-chain.pem - --tlsKeyFile=/etc/istio/certs/key.pem - --injectConfig=/etc/istio/inject/config - --meshConfig=/etc/istio/config/mesh - --healthCheckInterval=2s - --healthCheckFile=/health volumeMounts: - name: config-volume mountPath: /etc/istio/config readOnly: true - name: certs mountPath: /etc/istio/certs readOnly: true - name: inject-config mountPath: /etc/istio/inject readOnly: true livenessProbe: exec: command: - /usr/local/bin/sidecar-injector - probe - --probe-path=/health - --interval=2s initialDelaySeconds: 4 periodSeconds: 4 readinessProbe: exec: command: - /usr/local/bin/sidecar-injector - probe - --probe-path=/health - --interval=2s initialDelaySeconds: 4 periodSeconds: 4 volumes: - name: config-volume configMap: name: istio - name: certs secret: secretName: istio.istio-sidecar-injector-service-account - name: inject-config configMap: name: istio-sidecar-injector items: - key: config path: config affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - amd64 - ppc64le - s390x preferredDuringSchedulingIgnoredDuringExecution: - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - amd64 - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - ppc64le - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - s390x --- # Source: istio/charts/tracing/templates/deployment.yaml apiVersion: extensions/v1beta1 kind: Deployment metadata: name: istio-tracing namespace: istio-system labels: app: istio-tracing chart: tracing-0.1.0 release: RELEASE-NAME heritage: Tiller spec: replicas: 1 template: metadata: labels: app: jaeger annotations: sidecar.istio.io/inject: "false" spec: containers: - name: jaeger image: "jaegertracing/all-in-one:1.5" imagePullPolicy: IfNotPresent ports: - containerPort: 9411 - containerPort: 16686 - containerPort: 5775 protocol: UDP - containerPort: 6831 protocol: UDP - containerPort: 6832 protocol: UDP env: - name: POD_NAMESPACE valueFrom: fieldRef: apiVersion: v1 fieldPath: metadata.namespace - name: COLLECTOR_ZIPKIN_HTTP_PORT value: "9411" - name: MEMORY_MAX_TRACES value: "50000" livenessProbe: httpGet: path: / port: 16686 readinessProbe: httpGet: path: / port: 16686 resources: {} affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - amd64 - ppc64le - s390x preferredDuringSchedulingIgnoredDuringExecution: - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - amd64 - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - ppc64le - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - s390x --- # Source: istio/charts/security/templates/cleanup-old-ca.yaml apiVersion: batch/v1 kind: Job metadata: name: istio-cleanup-old-ca namespace: istio-system annotations: "helm.sh/hook": post-install "helm.sh/hook-delete-policy": hook-succeeded labels: app: security chart: security-0.8.0 release: RELEASE-NAME heritage: Tiller spec: template: metadata: name: istio-cleanup-old-ca labels: 
        app: security
        release: RELEASE-NAME
    spec:
      serviceAccountName: istio-cleanup-old-ca-service-account
      containers:
      - name: hyperkube
        image: "quay.io/coreos/hyperkube:v1.7.6_coreos.0"
        command:
        - /bin/bash
        - -c
        - >
          NS="-n istio-system";
          ./kubectl get deploy istio-ca $NS;
          if [[ $? = 0 ]]; then ./kubectl delete deploy istio-ca $NS; fi;
          ./kubectl get serviceaccount istio-ca-service-account $NS;
          if [[ $? = 0 ]]; then ./kubectl delete serviceaccount istio-ca-service-account $NS; fi;
          ./kubectl get service istio-ca-ilb $NS;
          if [[ $? = 0 ]]; then ./kubectl delete service istio-ca-ilb $NS; fi
      restartPolicy: Never
---
# Source: istio/charts/egressgateway/templates/autoscale.yaml
apiVersion: autoscaling/v2beta1
kind: HorizontalPodAutoscaler
metadata:
  name: istio-egressgateway
  namespace: istio-system
spec:
  maxReplicas: 1
  minReplicas: 1
  scaleTargetRef:
    apiVersion: apps/v1beta1
    kind: Deployment
    name: istio-egressgateway
  metrics:
  - type: Resource
    resource:
      name: cpu
      targetAverageUtilization: 80
---
# Source: istio/charts/ingressgateway/templates/autoscale.yaml
apiVersion: autoscaling/v2beta1
kind: HorizontalPodAutoscaler
metadata:
  name: istio-ingressgateway
  namespace: istio-system
spec:
  maxReplicas: 1
  minReplicas: 1
  scaleTargetRef:
    apiVersion: apps/v1beta1
    kind: Deployment
    name: istio-ingressgateway
  metrics:
  - type: Resource
    resource:
      name: cpu
      targetAverageUtilization: 80
---
# Source: istio/charts/tracing/templates/service.yaml
apiVersion: v1
kind: List
items:
- apiVersion: v1
  kind: Service
  metadata:
    name: zipkin
    namespace: istio-system
    labels:
      app: jaeger
      chart: tracing-0.1.0
      release: RELEASE-NAME
      heritage: Tiller
  spec:
    type: ClusterIP
    ports:
    - port: 9411
      targetPort: 9411
      protocol: TCP
      name: http
    selector:
      app: jaeger
- apiVersion: v1
  kind: Service
  metadata:
    name: tracing
    namespace: istio-system
    labels:
      app: jaeger
      chart: tracing-0.1.0
      release: RELEASE-NAME
      heritage: Tiller
  spec:
    ports:
    - name: query-http
      port: 80
      protocol: TCP
      targetPort: 16686
    selector:
      app: jaeger
    type: LoadBalancer
---
# Source: istio/charts/sidecarInjectorWebhook/templates/mutatingwebhook.yaml
apiVersion: admissionregistration.k8s.io/v1beta1
kind: MutatingWebhookConfiguration
metadata:
  name: istio-sidecar-injector
  namespace: istio-system
  labels:
    app: istio-sidecar-injector
    chart: sidecarInjectorWebhook-0.8.0
    release: RELEASE-NAME
    heritage: Tiller
webhooks:
- name: sidecar-injector.istio.io
  clientConfig:
    service:
      name: istio-sidecar-injector
      namespace: istio-system
      path: "/inject"
    caBundle: ""
  rules:
  - operations: [ "CREATE" ]
    apiGroups: [""]
    apiVersions: ["v1"]
    resources: ["pods"]
  failurePolicy: Fail
  namespaceSelector:
    matchLabels:
      istio-injection: enabled
---
# Source: istio/charts/grafana/templates/ingress.yaml
---
# Source: istio/charts/mixer/templates/config.yaml
---
# Source: istio/charts/prometheus/templates/ingress.yaml
---
# Source: istio/charts/servicegraph/templates/ingress.yaml
---
# Source: istio/charts/tracing/templates/ingress.yaml
---
# Source: istio/charts/tracing/templates/service-jaeger.yaml

================================================
FILE: examples/90_Kubernetes/istio/rendered/istio-v1.0-minikube.yml
================================================
apiVersion: v1
kind: Namespace
metadata:
  name: istio-system
---
# Source: istio/charts/galley/templates/configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: istio-galley-configuration
  namespace: istio-system
  labels:
    app: istio-galley
    chart: galley-1.0.0
    release: istio
    heritage: Tiller
    istio: mixer
data:
  validatingwebhookconfiguration.yaml: |-
    apiVersion:
admissionregistration.k8s.io/v1beta1 kind: ValidatingWebhookConfiguration metadata: name: istio-galley namespace: istio-system labels: app: istio-galley chart: galley-1.0.0 release: istio heritage: Tiller webhooks: - name: pilot.validation.istio.io clientConfig: service: name: istio-galley namespace: istio-system path: "/admitpilot" caBundle: "" rules: - operations: - CREATE - UPDATE apiGroups: - config.istio.io apiVersions: - v1alpha2 resources: - httpapispecs - httpapispecbindings - quotaspecs - quotaspecbindings - operations: - CREATE - UPDATE apiGroups: - rbac.istio.io apiVersions: - "*" resources: - "*" - operations: - CREATE - UPDATE apiGroups: - authentication.istio.io apiVersions: - "*" resources: - "*" - operations: - CREATE - UPDATE apiGroups: - networking.istio.io apiVersions: - "*" resources: - destinationrules - envoyfilters - gateways # disabled per @costinm's request # - serviceentries - virtualservices failurePolicy: Fail - name: mixer.validation.istio.io clientConfig: service: name: istio-galley namespace: istio-system path: "/admitmixer" caBundle: "" rules: - operations: - CREATE - UPDATE apiGroups: - config.istio.io apiVersions: - v1alpha2 resources: - rules - attributemanifests - circonuses - deniers - fluentds - kubernetesenvs - listcheckers - memquotas - noops - opas - prometheuses - rbacs - servicecontrols - solarwindses - stackdrivers - statsds - stdios - apikeys - authorizations - checknothings # - kuberneteses - listentries - logentries - metrics - quotas - reportnothings - servicecontrolreports - tracespans failurePolicy: Fail --- # Source: istio/charts/mixer/templates/configmap.yaml apiVersion: v1 kind: ConfigMap metadata: name: istio-statsd-prom-bridge namespace: istio-system labels: app: istio-statsd-prom-bridge chart: mixer-1.0.0 release: istio heritage: Tiller istio: mixer data: mapping.conf: |- --- # Source: istio/charts/prometheus/templates/configmap.yaml apiVersion: v1 kind: ConfigMap metadata: name: prometheus namespace: istio-system labels: app: prometheus chart: prometheus-0.1.0 release: istio heritage: Tiller data: prometheus.yml: |- global: scrape_interval: 15s scrape_configs: - job_name: 'istio-mesh' # Override the global default and scrape targets from this job every 5 seconds. scrape_interval: 5s kubernetes_sd_configs: - role: endpoints namespaces: names: - istio-system relabel_configs: - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] action: keep regex: istio-telemetry;prometheus - job_name: 'envoy' # Override the global default and scrape targets from this job every 5 seconds. scrape_interval: 5s # metrics_path defaults to '/metrics' # scheme defaults to 'http'. kubernetes_sd_configs: - role: endpoints namespaces: names: - istio-system relabel_configs: - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] action: keep regex: istio-statsd-prom-bridge;statsd-prom - job_name: 'istio-policy' # Override the global default and scrape targets from this job every 5 seconds. scrape_interval: 5s # metrics_path defaults to '/metrics' # scheme defaults to 'http'. kubernetes_sd_configs: - role: endpoints namespaces: names: - istio-system relabel_configs: - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] action: keep regex: istio-policy;http-monitoring - job_name: 'istio-telemetry' # Override the global default and scrape targets from this job every 5 seconds. scrape_interval: 5s # metrics_path defaults to '/metrics' # scheme defaults to 'http'. 
kubernetes_sd_configs: - role: endpoints namespaces: names: - istio-system relabel_configs: - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] action: keep regex: istio-telemetry;http-monitoring - job_name: 'pilot' # Override the global default and scrape targets from this job every 5 seconds. scrape_interval: 5s # metrics_path defaults to '/metrics' # scheme defaults to 'http'. kubernetes_sd_configs: - role: endpoints namespaces: names: - istio-system relabel_configs: - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] action: keep regex: istio-pilot;http-monitoring - job_name: 'galley' # Override the global default and scrape targets from this job every 5 seconds. scrape_interval: 5s # metrics_path defaults to '/metrics' # scheme defaults to 'http'. kubernetes_sd_configs: - role: endpoints namespaces: names: - istio-system relabel_configs: - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] action: keep regex: istio-galley;http-monitoring # scrape config for API servers - job_name: 'kubernetes-apiservers' kubernetes_sd_configs: - role: endpoints namespaces: names: - default scheme: https tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token relabel_configs: - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] action: keep regex: kubernetes;https # scrape config for nodes (kubelet) - job_name: 'kubernetes-nodes' scheme: https tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token kubernetes_sd_configs: - role: node relabel_configs: - action: labelmap regex: __meta_kubernetes_node_label_(.+) - target_label: __address__ replacement: kubernetes.default.svc:443 - source_labels: [__meta_kubernetes_node_name] regex: (.+) target_label: __metrics_path__ replacement: /api/v1/nodes/${1}/proxy/metrics # Scrape config for Kubelet cAdvisor. # # This is required for Kubernetes 1.7.3 and later, where cAdvisor metrics # (those whose names begin with 'container_') have been removed from the # Kubelet metrics endpoint. This job scrapes the cAdvisor endpoint to # retrieve those metrics. # # In Kubernetes 1.7.0-1.7.2, these metrics are only exposed on the cAdvisor # HTTP endpoint; use "replacement: /api/v1/nodes/${1}:4194/proxy/metrics" # in that case (and ensure cAdvisor's HTTP server hasn't been disabled with # the --cadvisor-port=0 Kubelet flag). # # This job is not necessary and should be removed in Kubernetes 1.6 and # earlier versions, or it will cause the metrics to be scraped twice. - job_name: 'kubernetes-cadvisor' scheme: https tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token kubernetes_sd_configs: - role: node relabel_configs: - action: labelmap regex: __meta_kubernetes_node_label_(.+) - target_label: __address__ replacement: kubernetes.default.svc:443 - source_labels: [__meta_kubernetes_node_name] regex: (.+) target_label: __metrics_path__ replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor # scrape config for service endpoints. 
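    # The two jobs below discover targets through the Kubernetes API and keep
    # only those that opt in via annotations; the relabel rules then rewrite
    # __scheme__, __metrics_path__, and __address__ from the annotation values.
    # A minimal, hypothetical opt-in on a Service or Pod looks like:
    #
    #   metadata:
    #     annotations:
    #       prometheus.io/scrape: "true"    # required by the 'keep' rule
    #       prometheus.io/port: "9102"      # example port; rewrites __address__
    #       prometheus.io/path: "/metrics"  # optional; default is /metrics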
    - job_name: 'kubernetes-service-endpoints'
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
        action: replace
        target_label: __scheme__
        regex: (https?)
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
        action: replace
        target_label: __address__
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_service_name]
        action: replace
        target_label: kubernetes_name
    # Example scrape config for pods
    - job_name: 'kubernetes-pods'
      kubernetes_sd_configs:
      - role: pod
      relabel_configs:
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        target_label: __address__
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: namespace
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: pod_name
---
# Source: istio/charts/security/templates/configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: istio-security-custom-resources
  namespace: istio-system
  labels:
    app: istio-security
    chart: security-1.0.0
    release: istio
    heritage: Tiller
    istio: security
data:
  custom-resources.yaml: |-
  run.sh: |-
    #!/bin/sh
    set -x
    if [ "$#" -ne "1" ]; then
        echo "first argument should be path to custom resource yaml"
        exit 1
    fi
    pathToResourceYAML=${1}
    /kubectl get validatingwebhookconfiguration istio-galley 2>/dev/null
    if [ "$?" -eq 0 ]; then
        echo "istio-galley validatingwebhookconfiguration found - waiting for istio-galley deployment to be ready"
        while true; do
            /kubectl -n istio-system get deployment istio-galley 2>/dev/null
            if [ "$?" -eq 0 ]; then
                break
            fi
            sleep 1
        done
        /kubectl -n istio-system rollout status deployment istio-galley
        if [ "$?" -ne 0 ]; then
            echo "istio-galley deployment rollout status check failed"
            exit 1
        fi
        echo "istio-galley deployment ready for configuration validation"
    fi
    sleep 5
    /kubectl apply -f ${pathToResourceYAML}
---
# Source: istio/templates/configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: istio
  namespace: istio-system
  labels:
    app: istio
    chart: istio-1.0.0
    release: istio
    heritage: Tiller
data:
  mesh: |-
    # Set the following variable to true to disable policy checks by the Mixer.
    # Note that metrics will still be reported to the Mixer.
    disablePolicyChecks: false
    # Set enableTracing to false to disable request tracing.
    enableTracing: true
    # Set accessLogFile to empty string to disable access log.
    accessLogFile: "/dev/stdout"
    #
    # Deprecated: mixer is using EDS
    mixerCheckServer: istio-policy.istio-system.svc.cluster.local:9091
    mixerReportServer: istio-telemetry.istio-system.svc.cluster.local:9091
    # Unix Domain Socket through which envoy communicates with NodeAgent SDS to get
    # key/cert for mTLS.
    # Use secret-mount files instead of SDS if set to empty.
    sdsUdsPath: ""
    # How frequently should Envoy fetch key/cert from NodeAgent.
    sdsRefreshDelay: 15s
    #
    defaultConfig:
      #
      # TCP connection timeout between Envoy & the application, and between Envoys.
      connectTimeout: 10s
      #
      ### ADVANCED SETTINGS #############
      # Where should envoy's configuration be stored in the istio-proxy container
      configPath: "/etc/istio/proxy"
      binaryPath: "/usr/local/bin/envoy"
      # The pseudo service name used for Envoy.
      serviceCluster: istio-proxy
      # These settings that determine how long an old Envoy
      # process should be kept alive after an occasional reload.
      drainDuration: 45s
      parentShutdownDuration: 1m0s
      #
      # The mode used to redirect inbound connections to Envoy. This setting
      # has no effect on outbound traffic: iptables REDIRECT is always used for
      # outbound connections.
      # If "REDIRECT", use iptables REDIRECT to NAT and redirect to Envoy.
      # The "REDIRECT" mode loses source addresses during redirection.
      # If "TPROXY", use iptables TPROXY to redirect to Envoy.
      # The "TPROXY" mode preserves both the source and destination IP
      # addresses and ports, so that they can be used for advanced filtering
      # and manipulation.
      # The "TPROXY" mode also configures the sidecar to run with the
      # CAP_NET_ADMIN capability, which is required to use TPROXY.
      #interceptionMode: REDIRECT
      #
      # Port where Envoy listens (on local host) for admin commands
      # You can exec into the istio-proxy container in a pod and
      # curl the admin port (curl http://localhost:15000/) to obtain
      # diagnostic information from Envoy. See
      # https://lyft.github.io/envoy/docs/operations/admin.html
      # for more details
      proxyAdminPort: 15000
      #
      # Zipkin trace collector
      zipkinAddress: zipkin.istio-system:9411
      #
      # Statsd metrics collector converts statsd metrics into Prometheus metrics.
      statsdUdpAddress: istio-statsd-prom-bridge.istio-system:9125
      #
      # Mutual TLS authentication between sidecars and istio control plane.
      controlPlaneAuthPolicy: NONE
      #
      # Address where istio Pilot service is running
      discoveryAddress: istio-pilot.istio-system:15007
---
# Source: istio/templates/sidecar-injector-configmap.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: istio-sidecar-injector
  namespace: istio-system
  labels:
    app: istio
    chart: istio-1.0.0
    release: istio
    heritage: Tiller
    istio: sidecar-injector
data:
  config: |-
    policy: enabled
    template: |-
      initContainers:
      - name: istio-init
        image: "docker.io/istio/proxy_init:1.0.0"
        args:
        - "-p"
        - [[ .MeshConfig.ProxyListenPort ]]
        - "-u"
        - 1337
        - "-m"
        - [[ or (index .ObjectMeta.Annotations "sidecar.istio.io/interceptionMode") .ProxyConfig.InterceptionMode.String ]]
        - "-i"
        [[ if (isset .ObjectMeta.Annotations "traffic.sidecar.istio.io/includeOutboundIPRanges") -]]
        - "[[ index .ObjectMeta.Annotations "traffic.sidecar.istio.io/includeOutboundIPRanges" ]]"
        [[ else -]]
        - "*"
        [[ end -]]
        - "-x"
        [[ if (isset .ObjectMeta.Annotations "traffic.sidecar.istio.io/excludeOutboundIPRanges") -]]
        - "[[ index .ObjectMeta.Annotations "traffic.sidecar.istio.io/excludeOutboundIPRanges" ]]"
        [[ else -]]
        - ""
        [[ end -]]
        - "-b"
        [[ if (isset .ObjectMeta.Annotations "traffic.sidecar.istio.io/includeInboundPorts") -]]
        - "[[ index .ObjectMeta.Annotations "traffic.sidecar.istio.io/includeInboundPorts" ]]"
        [[ else -]]
        - [[ range .Spec.Containers -]][[ range .Ports -]][[ .ContainerPort -]], [[ end -]][[ end -]][[ end]]
        - "-d"
        [[ if (isset .ObjectMeta.Annotations "traffic.sidecar.istio.io/excludeInboundPorts") -]]
        - "[[ index .ObjectMeta.Annotations "traffic.sidecar.istio.io/excludeInboundPorts" ]]"
        [[ else -]]
        - ""
        [[ end -]]
        imagePullPolicy: IfNotPresent
        securityContext:
          capabilities:
            add:
            - NET_ADMIN
          privileged: true
        restartPolicy: Always
      containers:
      - name: istio-proxy
        image: [[ if (isset .ObjectMeta.Annotations "sidecar.istio.io/proxyImage") -]]
        "[[ index .ObjectMeta.Annotations "sidecar.istio.io/proxyImage" ]]"
        [[ else -]]
        docker.io/istio/proxyv2:1.0.0
        [[ end -]]
        args:
        - proxy
        - sidecar
        - --configPath
        - [[ .ProxyConfig.ConfigPath ]]
        - --binaryPath
        - [[ .ProxyConfig.BinaryPath ]]
        - --serviceCluster
        [[ if ne "" (index .ObjectMeta.Labels "app") -]]
        - [[ index .ObjectMeta.Labels "app" ]]
        [[ else -]]
        - "istio-proxy"
        [[ end -]]
        - --drainDuration
        - [[ formatDuration .ProxyConfig.DrainDuration ]]
        - --parentShutdownDuration
        - [[ formatDuration .ProxyConfig.ParentShutdownDuration ]]
        - --discoveryAddress
        - [[ .ProxyConfig.DiscoveryAddress ]]
        - --discoveryRefreshDelay
        - [[ formatDuration .ProxyConfig.DiscoveryRefreshDelay ]]
        - --zipkinAddress
        - [[ .ProxyConfig.ZipkinAddress ]]
        - --connectTimeout
        - [[ formatDuration .ProxyConfig.ConnectTimeout ]]
        - --statsdUdpAddress
        - [[ .ProxyConfig.StatsdUdpAddress ]]
        - --proxyAdminPort
        - [[ .ProxyConfig.ProxyAdminPort ]]
        - --controlPlaneAuthPolicy
        - [[ or (index .ObjectMeta.Annotations "sidecar.istio.io/controlPlaneAuthPolicy") .ProxyConfig.ControlPlaneAuthPolicy ]]
        env:
        - name: POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
        - name: POD_NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        - name: INSTANCE_IP
          valueFrom:
            fieldRef:
              fieldPath: status.podIP
        - name: ISTIO_META_POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
        - name: ISTIO_META_INTERCEPTION_MODE
          value: [[ or (index .ObjectMeta.Annotations "sidecar.istio.io/interceptionMode") .ProxyConfig.InterceptionMode.String ]]
        imagePullPolicy: IfNotPresent
        securityContext:
          privileged: false
          readOnlyRootFilesystem: true
        [[ if eq (or (index .ObjectMeta.Annotations "sidecar.istio.io/interceptionMode") .ProxyConfig.InterceptionMode.String) "TPROXY" -]]
          capabilities:
            add:
            - NET_ADMIN
          runAsGroup: 1337
        [[ else -]]
          runAsUser: 1337
        [[ end -]]
        restartPolicy: Always
        resources:
          [[ if (isset .ObjectMeta.Annotations "sidecar.istio.io/proxyCPU") -]]
          requests:
            cpu: "[[ index .ObjectMeta.Annotations "sidecar.istio.io/proxyCPU" ]]"
            memory: "[[ index .ObjectMeta.Annotations "sidecar.istio.io/proxyMemory" ]]"
          [[ else -]]
          requests:
            cpu: 10m
          [[ end -]]
        volumeMounts:
        - mountPath: /etc/istio/proxy
          name: istio-envoy
        - mountPath: /etc/certs/
          name: istio-certs
          readOnly: true
      volumes:
      - emptyDir:
          medium: Memory
        name: istio-envoy
      - name: istio-certs
        secret:
          optional: true
          [[ if eq .Spec.ServiceAccountName "" -]]
          secretName: istio.default
          [[ else -]]
          secretName: [[ printf "istio.%s" .Spec.ServiceAccountName ]]
          [[ end -]]
---
# Source: istio/charts/galley/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: istio-galley-service-account
  namespace: istio-system
  labels:
    app: istio-galley
    chart: galley-1.0.0
    heritage: Tiller
    release: istio
---
# Source: istio/charts/gateways/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: istio-egressgateway-service-account
  namespace: istio-system
  labels:
    app: egressgateway
    chart: gateways-1.0.0
    heritage: Tiller
    release: istio
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: istio-ingressgateway-service-account
  namespace: istio-system
  labels:
    app: ingressgateway
    chart: gateways-1.0.0
    heritage: Tiller
    release: istio
---
---
# Source: istio/charts/mixer/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: istio-mixer-service-account
  namespace: istio-system
  labels:
    app: mixer
    chart: mixer-1.0.0
    heritage: Tiller
    release: istio
---
# Source: istio/charts/pilot/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: istio-pilot-service-account
  namespace: istio-system
  labels:
    app: istio-pilot
    chart: pilot-1.0.0
    heritage: Tiller
    release: istio
---
# Source: istio/charts/prometheus/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: istio-system
---
# Source: istio/charts/security/templates/cleanup-secrets.yaml
# The reason for creating a ServiceAccount and ClusterRole specifically for this
# post-delete hooked job is because the citadel ServiceAccount is being deleted
# before this hook is launched. On the other hand, running this hook before the
# deletion of the citadel (e.g. pre-delete) won't delete the secrets because they
# will be re-created immediately by the to-be-deleted citadel.
#
# It's also important that the ServiceAccount, ClusterRole and ClusterRoleBinding
# will be ready before running the hooked Job therefore the hook weights.
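# Editor's note (not part of the upstream chart): concretely, the weights below
# order the post-delete hooks as ServiceAccount and ClusterRole ("helm.sh/hook-weight": "1"),
# then the ClusterRoleBinding ("2"), then the cleanup Job ("3"), so the Job's
# RBAC is in place before it runs.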
apiVersion: v1
kind: ServiceAccount
metadata:
  name: istio-cleanup-secrets-service-account
  namespace: istio-system
  annotations:
    "helm.sh/hook": post-delete
    "helm.sh/hook-delete-policy": hook-succeeded
    "helm.sh/hook-weight": "1"
  labels:
    app: security
    chart: security-1.0.0
    heritage: Tiller
    release: istio
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  name: istio-cleanup-secrets-istio-system
  annotations:
    "helm.sh/hook": post-delete
    "helm.sh/hook-delete-policy": hook-succeeded
    "helm.sh/hook-weight": "1"
  labels:
    app: security
    chart: security-1.0.0
    heritage: Tiller
    release: istio
rules:
- apiGroups: [""]
  resources: ["secrets"]
  verbs: ["list", "delete"]
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: istio-cleanup-secrets-istio-system
  annotations:
    "helm.sh/hook": post-delete
    "helm.sh/hook-delete-policy": hook-succeeded
    "helm.sh/hook-weight": "2"
  labels:
    app: security
    chart: security-1.0.0
    heritage: Tiller
    release: istio
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: istio-cleanup-secrets-istio-system
subjects:
- kind: ServiceAccount
  name: istio-cleanup-secrets-service-account
  namespace: istio-system
---
apiVersion: batch/v1
kind: Job
metadata:
  name: istio-cleanup-secrets
  namespace: istio-system
  annotations:
    "helm.sh/hook": post-delete
    "helm.sh/hook-delete-policy": hook-succeeded
    "helm.sh/hook-weight": "3"
  labels:
    app: security
    chart: security-1.0.0
    release: istio
    heritage: Tiller
spec:
  template:
    metadata:
      name: istio-cleanup-secrets
      labels:
        app: security
        release: istio
    spec:
      serviceAccountName: istio-cleanup-secrets-service-account
      containers:
      - name: hyperkube
        image: "quay.io/coreos/hyperkube:v1.7.6_coreos.0"
        command:
        - /bin/bash
        - -c
        - >
          kubectl get secret --all-namespaces | grep "istio.io/key-and-cert" | while read -r entry; do
          ns=$(echo $entry | awk '{print $1}');
          name=$(echo $entry | awk '{print $2}');
          kubectl delete secret $name -n $ns;
          done
      restartPolicy: OnFailure
---
# Source: istio/charts/security/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: istio-citadel-service-account
  namespace: istio-system
  labels:
    app: security
    chart: security-1.0.0
    heritage: Tiller
    release: istio
---
# Source: istio/charts/sidecarInjectorWebhook/templates/serviceaccount.yaml
apiVersion: v1
kind: ServiceAccount
metadata:
  name: istio-sidecar-injector-service-account
  namespace: istio-system
  labels:
    app: istio-sidecar-injector
    chart: sidecarInjectorWebhook-1.0.0
    heritage: Tiller
    release: istio
---
# Source: istio/templates/crds.yaml
#
# these CRDs only make sense when pilot is enabled
#
apiVersion: apiextensions.k8s.io/v1beta1
kind: CustomResourceDefinition
metadata:
  name: virtualservices.networking.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: istio-pilot
spec:
  group: networking.istio.io
  names:
    kind: VirtualService
    listKind: VirtualServiceList
    plural: virtualservices
    singular: virtualservice
    categories:
    - istio-io
    - networking-istio-io
  scope: Namespaced
  version: v1alpha3
---
apiVersion: apiextensions.k8s.io/v1beta1
kind: CustomResourceDefinition
metadata:
  name: destinationrules.networking.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: istio-pilot
spec:
  group: networking.istio.io
  names:
    kind: DestinationRule
    listKind: DestinationRuleList
    plural: destinationrules
    singular: destinationrule
    categories:
    - istio-io
    - networking-istio-io
  scope: Namespaced
  version: v1alpha3
---
apiVersion: apiextensions.k8s.io/v1beta1
kind: CustomResourceDefinition
metadata:
  name: serviceentries.networking.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: istio-pilot
spec:
  group: networking.istio.io
  names:
    kind: ServiceEntry
    listKind: ServiceEntryList
    plural: serviceentries
    singular: serviceentry
    categories:
    - istio-io
    - networking-istio-io
  scope: Namespaced
  version: v1alpha3
---
apiVersion: apiextensions.k8s.io/v1beta1
kind: CustomResourceDefinition
metadata:
  name: gateways.networking.istio.io
  annotations:
    "helm.sh/hook": crd-install
    "helm.sh/hook-weight": "-5"
  labels:
    app: istio-pilot
spec:
  group: networking.istio.io
  names:
    kind: Gateway
    plural: gateways
    singular: gateway
    categories:
    - istio-io
    - networking-istio-io
  scope: Namespaced
  version: v1alpha3
---
apiVersion: apiextensions.k8s.io/v1beta1
kind: CustomResourceDefinition
metadata:
  name: envoyfilters.networking.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: istio-pilot
spec:
  group: networking.istio.io
  names:
    kind: EnvoyFilter
    plural: envoyfilters
    singular: envoyfilter
    categories:
    - istio-io
    - networking-istio-io
  scope: Namespaced
  version: v1alpha3
---
#
# these CRDs only make sense when security is enabled
#
#
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  annotations:
    "helm.sh/hook": crd-install
  name: httpapispecbindings.config.istio.io
spec:
  group: config.istio.io
  names:
    kind: HTTPAPISpecBinding
    plural: httpapispecbindings
    singular: httpapispecbinding
    categories:
    - istio-io
    - apim-istio-io
  scope: Namespaced
  version: v1alpha2
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  annotations:
    "helm.sh/hook": crd-install
  name: httpapispecs.config.istio.io
spec:
  group: config.istio.io
  names:
    kind: HTTPAPISpec
    plural: httpapispecs
    singular: httpapispec
    categories:
    - istio-io
    - apim-istio-io
  scope: Namespaced
  version: v1alpha2
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  annotations:
    "helm.sh/hook": crd-install
  name: quotaspecbindings.config.istio.io
spec:
  group: config.istio.io
  names:
    kind: QuotaSpecBinding
    plural: quotaspecbindings
    singular: quotaspecbinding
    categories:
    - istio-io
    - apim-istio-io
  scope: Namespaced
  version: v1alpha2
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  annotations:
    "helm.sh/hook": crd-install
  name: quotaspecs.config.istio.io
spec:
  group: config.istio.io
  names:
    kind: QuotaSpec
    plural: quotaspecs
    singular: quotaspec
    categories:
    - istio-io
    - apim-istio-io
  scope: Namespaced
  version: v1alpha2
---
# Mixer CRDs
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: rules.config.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: mixer
    package: istio.io.mixer
    istio: core
spec:
  group: config.istio.io
  names:
    kind: rule
    plural: rules
    singular: rule
    categories:
    - istio-io
    - policy-istio-io
  scope: Namespaced
  version: v1alpha2
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: attributemanifests.config.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: mixer
    package: istio.io.mixer
    istio: core
spec:
  group: config.istio.io
  names:
    kind: attributemanifest
    plural: attributemanifests
    singular: attributemanifest
    categories:
    - istio-io
    - policy-istio-io
  scope: Namespaced
  version: v1alpha2
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: bypasses.config.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: mixer
    package: bypass
    istio: mixer-adapter
spec:
  group: config.istio.io
  names:
    kind: bypass
    plural: bypasses
    singular: bypass
    categories:
    - istio-io
    - policy-istio-io
  scope: Namespaced
  version: v1alpha2
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: circonuses.config.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: mixer
    package: circonus
    istio: mixer-adapter
spec:
  group: config.istio.io
  names:
    kind: circonus
    plural: circonuses
    singular: circonus
    categories:
    - istio-io
    - policy-istio-io
  scope: Namespaced
  version: v1alpha2
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: deniers.config.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: mixer
    package: denier
    istio: mixer-adapter
spec:
  group: config.istio.io
  names:
    kind: denier
    plural: deniers
    singular: denier
    categories:
    - istio-io
    - policy-istio-io
  scope: Namespaced
  version: v1alpha2
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: fluentds.config.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: mixer
    package: fluentd
    istio: mixer-adapter
spec:
  group: config.istio.io
  names:
    kind: fluentd
    plural: fluentds
    singular: fluentd
    categories:
    - istio-io
    - policy-istio-io
  scope: Namespaced
  version: v1alpha2
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: kubernetesenvs.config.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: mixer
    package: kubernetesenv
    istio: mixer-adapter
spec:
  group: config.istio.io
  names:
    kind: kubernetesenv
    plural: kubernetesenvs
    singular: kubernetesenv
    categories:
    - istio-io
    - policy-istio-io
  scope: Namespaced
  version: v1alpha2
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: listcheckers.config.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: mixer
    package: listchecker
    istio: mixer-adapter
spec:
  group: config.istio.io
  names:
    kind: listchecker
    plural: listcheckers
    singular: listchecker
    categories:
    - istio-io
    - policy-istio-io
  scope: Namespaced
  version: v1alpha2
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: memquotas.config.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: mixer
    package: memquota
    istio: mixer-adapter
spec:
  group: config.istio.io
  names:
    kind: memquota
    plural: memquotas
    singular: memquota
    categories:
    - istio-io
    - policy-istio-io
  scope: Namespaced
  version: v1alpha2
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: noops.config.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: mixer
    package: noop
    istio: mixer-adapter
spec:
  group: config.istio.io
  names:
    kind: noop
    plural: noops
    singular: noop
    categories:
    - istio-io
    - policy-istio-io
  scope: Namespaced
  version: v1alpha2
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: opas.config.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: mixer
    package: opa
    istio: mixer-adapter
spec:
  group: config.istio.io
  names:
    kind: opa
    plural: opas
    singular: opa
    categories:
    - istio-io
    - policy-istio-io
  scope: Namespaced
  version: v1alpha2
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: prometheuses.config.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: mixer
    package: prometheus
    istio: mixer-adapter
spec:
  group: config.istio.io
  names:
    kind: prometheus
    plural: prometheuses
    singular: prometheus
    categories:
    - istio-io
    - policy-istio-io
  scope: Namespaced
  version: v1alpha2
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: rbacs.config.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: mixer
    package: rbac
    istio: mixer-adapter
spec:
  group: config.istio.io
  names:
    kind: rbac
    plural: rbacs
    singular: rbac
    categories:
    - istio-io
    - policy-istio-io
  scope: Namespaced
  version: v1alpha2
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: redisquotas.config.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    package: redisquota
    istio: mixer-adapter
spec:
  group: config.istio.io
  names:
    kind: redisquota
    plural: redisquotas
    singular: redisquota
  scope: Namespaced
  version: v1alpha2
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: servicecontrols.config.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: mixer
    package: servicecontrol
    istio: mixer-adapter
spec:
  group: config.istio.io
  names:
    kind: servicecontrol
    plural: servicecontrols
    singular: servicecontrol
    categories:
    - istio-io
    - policy-istio-io
  scope: Namespaced
  version: v1alpha2
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: signalfxs.config.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: mixer
    package: signalfx
    istio: mixer-adapter
spec:
  group: config.istio.io
  names:
    kind: signalfx
    plural: signalfxs
    singular: signalfx
    categories:
    - istio-io
    - policy-istio-io
  scope: Namespaced
  version: v1alpha2
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: solarwindses.config.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: mixer
    package: solarwinds
    istio: mixer-adapter
spec:
  group: config.istio.io
  names:
    kind: solarwinds
    plural: solarwindses
    singular: solarwinds
    categories:
    - istio-io
    - policy-istio-io
  scope: Namespaced
  version: v1alpha2
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: stackdrivers.config.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: mixer
    package: stackdriver
    istio: mixer-adapter
spec:
  group: config.istio.io
  names:
    kind: stackdriver
    plural: stackdrivers
    singular: stackdriver
    categories:
    - istio-io
    - policy-istio-io
  scope: Namespaced
  version: v1alpha2
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: statsds.config.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: mixer
    package: statsd
    istio: mixer-adapter
spec:
  group: config.istio.io
  names:
    kind: statsd
    plural: statsds
    singular: statsd
    categories:
    - istio-io
    - policy-istio-io
  scope: Namespaced
  version: v1alpha2
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: stdios.config.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: mixer
    package: stdio
    istio: mixer-adapter
spec:
  group: config.istio.io
  names:
    kind: stdio
    plural: stdios
    singular: stdio
    categories:
    - istio-io
    - policy-istio-io
  scope: Namespaced
  version: v1alpha2
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: apikeys.config.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: mixer
    package: apikey
    istio: mixer-instance
spec:
  group: config.istio.io
  names:
    kind: apikey
    plural: apikeys
    singular: apikey
    categories:
    - istio-io
    - policy-istio-io
  scope: Namespaced
  version: v1alpha2
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: authorizations.config.istio.io
  annotations:
"helm.sh/hook": crd-install labels: app: mixer package: authorization istio: mixer-instance spec: group: config.istio.io names: kind: authorization plural: authorizations singular: authorization categories: - istio-io - policy-istio-io scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: checknothings.config.istio.io annotations: "helm.sh/hook": crd-install labels: app: mixer package: checknothing istio: mixer-instance spec: group: config.istio.io names: kind: checknothing plural: checknothings singular: checknothing categories: - istio-io - policy-istio-io scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: kuberneteses.config.istio.io annotations: "helm.sh/hook": crd-install labels: app: mixer package: adapter.template.kubernetes istio: mixer-instance spec: group: config.istio.io names: kind: kubernetes plural: kuberneteses singular: kubernetes categories: - istio-io - policy-istio-io scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: listentries.config.istio.io annotations: "helm.sh/hook": crd-install labels: app: mixer package: listentry istio: mixer-instance spec: group: config.istio.io names: kind: listentry plural: listentries singular: listentry categories: - istio-io - policy-istio-io scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: logentries.config.istio.io annotations: "helm.sh/hook": crd-install labels: app: mixer package: logentry istio: mixer-instance spec: group: config.istio.io names: kind: logentry plural: logentries singular: logentry categories: - istio-io - policy-istio-io scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: edges.config.istio.io annotations: "helm.sh/hook": crd-install labels: app: mixer package: edge istio: mixer-instance spec: group: config.istio.io names: kind: edge plural: edges singular: edge categories: - istio-io - policy-istio-io scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: metrics.config.istio.io annotations: "helm.sh/hook": crd-install labels: app: mixer package: metric istio: mixer-instance spec: group: config.istio.io names: kind: metric plural: metrics singular: metric categories: - istio-io - policy-istio-io scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: quotas.config.istio.io annotations: "helm.sh/hook": crd-install labels: app: mixer package: quota istio: mixer-instance spec: group: config.istio.io names: kind: quota plural: quotas singular: quota categories: - istio-io - policy-istio-io scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: reportnothings.config.istio.io annotations: "helm.sh/hook": crd-install labels: app: mixer package: reportnothing istio: mixer-instance spec: group: config.istio.io names: kind: reportnothing plural: reportnothings singular: reportnothing categories: - istio-io - policy-istio-io scope: Namespaced version: v1alpha2 --- kind: CustomResourceDefinition apiVersion: apiextensions.k8s.io/v1beta1 metadata: name: servicecontrolreports.config.istio.io annotations: "helm.sh/hook": crd-install labels: 
    app: mixer
    package: servicecontrolreport
    istio: mixer-instance
spec:
  group: config.istio.io
  names:
    kind: servicecontrolreport
    plural: servicecontrolreports
    singular: servicecontrolreport
    categories:
    - istio-io
    - policy-istio-io
  scope: Namespaced
  version: v1alpha2
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: tracespans.config.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: mixer
    package: tracespan
    istio: mixer-instance
spec:
  group: config.istio.io
  names:
    kind: tracespan
    plural: tracespans
    singular: tracespan
    categories:
    - istio-io
    - policy-istio-io
  scope: Namespaced
  version: v1alpha2
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: rbacconfigs.rbac.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: mixer
    package: istio.io.mixer
    istio: rbac
spec:
  group: rbac.istio.io
  names:
    kind: RbacConfig
    plural: rbacconfigs
    singular: rbacconfig
    categories:
    - istio-io
    - rbac-istio-io
  scope: Namespaced
  version: v1alpha1
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: serviceroles.rbac.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: mixer
    package: istio.io.mixer
    istio: rbac
spec:
  group: rbac.istio.io
  names:
    kind: ServiceRole
    plural: serviceroles
    singular: servicerole
    categories:
    - istio-io
    - rbac-istio-io
  scope: Namespaced
  version: v1alpha1
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: servicerolebindings.rbac.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: mixer
    package: istio.io.mixer
    istio: rbac
spec:
  group: rbac.istio.io
  names:
    kind: ServiceRoleBinding
    plural: servicerolebindings
    singular: servicerolebinding
    categories:
    - istio-io
    - rbac-istio-io
  scope: Namespaced
  version: v1alpha1
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: adapters.config.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: mixer
    package: adapter
    istio: mixer-adapter
spec:
  group: config.istio.io
  names:
    kind: adapter
    plural: adapters
    singular: adapter
    categories:
    - istio-io
    - policy-istio-io
  scope: Namespaced
  version: v1alpha2
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: instances.config.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: mixer
    package: instance
    istio: mixer-instance
spec:
  group: config.istio.io
  names:
    kind: instance
    plural: instances
    singular: instance
    categories:
    - istio-io
    - policy-istio-io
  scope: Namespaced
  version: v1alpha2
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: templates.config.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: mixer
    package: template
    istio: mixer-template
spec:
  group: config.istio.io
  names:
    kind: template
    plural: templates
    singular: template
    categories:
    - istio-io
    - policy-istio-io
  scope: Namespaced
  version: v1alpha2
---
kind: CustomResourceDefinition
apiVersion: apiextensions.k8s.io/v1beta1
metadata:
  name: handlers.config.istio.io
  annotations:
    "helm.sh/hook": crd-install
  labels:
    app: mixer
    package: handler
    istio: mixer-handler
spec:
  group: config.istio.io
  names:
    kind: handler
    plural: handlers
    singular: handler
    categories:
    - istio-io
    - policy-istio-io
  scope: Namespaced
  version: v1alpha2
---
#
#
---
# Source: istio/charts/galley/templates/clusterrole.yaml
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  name: istio-galley-istio-system
  labels:
    app: istio-galley
    chart: galley-1.0.0
    heritage: Tiller
    release: istio
rules:
- apiGroups: ["admissionregistration.k8s.io"]
  resources: ["validatingwebhookconfigurations"]
  verbs: ["*"]
- apiGroups: ["config.istio.io"] # istio mixer CRD watcher
  resources: ["*"]
  verbs: ["get", "list", "watch"]
- apiGroups: ["*"]
  resources: ["deployments"]
  resourceNames: ["istio-galley"]
  verbs: ["get"]
---
# Source: istio/charts/gateways/templates/clusterrole.yaml
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  labels:
    app: gateways
    chart: gateways-1.0.0
    heritage: Tiller
    release: istio
  name: istio-egressgateway-istio-system
rules:
- apiGroups: ["extensions"]
  resources: ["thirdpartyresources", "virtualservices", "destinationrules", "gateways"]
  verbs: ["get", "watch", "list", "update"]
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  labels:
    app: gateways
    chart: gateways-1.0.0
    heritage: Tiller
    release: istio
  name: istio-ingressgateway-istio-system
rules:
- apiGroups: ["extensions"]
  resources: ["thirdpartyresources", "virtualservices", "destinationrules", "gateways"]
  verbs: ["get", "watch", "list", "update"]
---
---
# Source: istio/charts/mixer/templates/clusterrole.yaml
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  name: istio-mixer-istio-system
  labels:
    app: mixer
    chart: mixer-1.0.0
    heritage: Tiller
    release: istio
rules:
- apiGroups: ["config.istio.io"] # istio CRD watcher
  resources: ["*"]
  verbs: ["create", "get", "list", "watch", "patch"]
- apiGroups: ["rbac.istio.io"] # istio RBAC watcher
  resources: ["*"]
  verbs: ["get", "list", "watch"]
- apiGroups: ["apiextensions.k8s.io"]
  resources: ["customresourcedefinitions"]
  verbs: ["get", "list", "watch"]
- apiGroups: [""]
  resources: ["configmaps", "endpoints", "pods", "services", "namespaces", "secrets"]
  verbs: ["get", "list", "watch"]
- apiGroups: ["extensions"]
  resources: ["replicasets"]
  verbs: ["get", "list", "watch"]
- apiGroups: ["apps"]
  resources: ["replicasets"]
  verbs: ["get", "list", "watch"]
---
# Source: istio/charts/pilot/templates/clusterrole.yaml
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  name: istio-pilot-istio-system
  labels:
    app: istio-pilot
    chart: pilot-1.0.0
    heritage: Tiller
    release: istio
rules:
- apiGroups: ["config.istio.io"]
  resources: ["*"]
  verbs: ["*"]
- apiGroups: ["rbac.istio.io"]
  resources: ["*"]
  verbs: ["get", "watch", "list"]
- apiGroups: ["networking.istio.io"]
  resources: ["*"]
  verbs: ["*"]
- apiGroups: ["authentication.istio.io"]
  resources: ["*"]
  verbs: ["*"]
- apiGroups: ["apiextensions.k8s.io"]
  resources: ["customresourcedefinitions"]
  verbs: ["*"]
- apiGroups: ["extensions"]
  resources: ["thirdpartyresources", "thirdpartyresources.extensions", "ingresses", "ingresses/status"]
  verbs: ["*"]
- apiGroups: [""]
  resources: ["configmaps"]
  verbs: ["create", "get", "list", "watch", "update"]
- apiGroups: [""]
  resources: ["endpoints", "pods", "services"]
  verbs: ["get", "list", "watch"]
- apiGroups: [""]
  resources: ["namespaces", "nodes", "secrets"]
  verbs: ["get", "list", "watch"]
---
# Source: istio/charts/prometheus/templates/clusterrole.yaml
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  name: prometheus-istio-system
rules:
- apiGroups: [""]
  resources:
  - nodes
  - services
  - endpoints
  - pods
  - nodes/proxy
  verbs: ["get", "list", "watch"]
- apiGroups: [""]
  resources:
  - configmaps
  verbs: ["get"]
- nonResourceURLs: ["/metrics"]
  verbs: ["get"]
---
# Source: istio/charts/security/templates/clusterrole.yaml
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  name: istio-citadel-istio-system
  labels:
    app: security
    chart: security-1.0.0
    heritage: Tiller
    release: istio
rules:
- apiGroups: [""]
  resources: ["secrets"]
  verbs: ["create", "get", "watch", "list", "update", "delete"]
- apiGroups: [""]
  resources: ["serviceaccounts"]
  verbs: ["get", "watch", "list"]
- apiGroups: [""]
  resources: ["services"]
  verbs: ["get", "watch", "list"]
---
# Source: istio/charts/sidecarInjectorWebhook/templates/clusterrole.yaml
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  name: istio-sidecar-injector-istio-system
  labels:
    app: istio-sidecar-injector
    chart: sidecarInjectorWebhook-1.0.0
    heritage: Tiller
    release: istio
rules:
- apiGroups: ["*"]
  resources: ["configmaps"]
  verbs: ["get", "list", "watch"]
- apiGroups: ["admissionregistration.k8s.io"]
  resources: ["mutatingwebhookconfigurations"]
  verbs: ["get", "list", "watch", "patch"]
---
# Source: istio/charts/galley/templates/clusterrolebinding.yaml
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: istio-galley-admin-role-binding-istio-system
  labels:
    app: istio-galley
    chart: galley-1.0.0
    heritage: Tiller
    release: istio
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: istio-galley-istio-system
subjects:
- kind: ServiceAccount
  name: istio-galley-service-account
  namespace: istio-system
---
# Source: istio/charts/gateways/templates/clusterrolebindings.yaml
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: istio-egressgateway-istio-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: istio-egressgateway-istio-system
subjects:
- kind: ServiceAccount
  name: istio-egressgateway-service-account
  namespace: istio-system
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: istio-ingressgateway-istio-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: istio-ingressgateway-istio-system
subjects:
- kind: ServiceAccount
  name: istio-ingressgateway-service-account
  namespace: istio-system
---
---
# Source: istio/charts/mixer/templates/clusterrolebinding.yaml
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: istio-mixer-admin-role-binding-istio-system
  labels:
    app: mixer
    chart: mixer-1.0.0
    heritage: Tiller
    release: istio
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: istio-mixer-istio-system
subjects:
- kind: ServiceAccount
  name: istio-mixer-service-account
  namespace: istio-system
---
# Source: istio/charts/pilot/templates/clusterrolebinding.yaml
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: istio-pilot-istio-system
  labels:
    app: istio-pilot
    chart: pilot-1.0.0
    heritage: Tiller
    release: istio
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: istio-pilot-istio-system
subjects:
- kind: ServiceAccount
  name: istio-pilot-service-account
  namespace: istio-system
---
# Source: istio/charts/prometheus/templates/clusterrolebindings.yaml
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: prometheus-istio-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus-istio-system
subjects:
- kind: ServiceAccount
  name: prometheus
  namespace: istio-system
---
# Source: istio/charts/security/templates/clusterrolebinding.yaml
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: istio-citadel-istio-system
  labels:
    app: security
    chart: security-1.0.0
    heritage: Tiller
    release: istio
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: istio-citadel-istio-system
subjects:
- kind: ServiceAccount
  name: istio-citadel-service-account
  namespace: istio-system
---
# Source: istio/charts/sidecarInjectorWebhook/templates/clusterrolebinding.yaml
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: istio-sidecar-injector-admin-role-binding-istio-system
  labels:
    app: istio-sidecar-injector
    chart: sidecarInjectorWebhook-1.0.0
    heritage: Tiller
    release: istio
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: istio-sidecar-injector-istio-system
subjects:
- kind: ServiceAccount
  name: istio-sidecar-injector-service-account
  namespace: istio-system
---
# Source: istio/charts/galley/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: istio-galley
  namespace: istio-system
  labels:
    istio: galley
spec:
  ports:
  - port: 443
    name: https-validation
  - port: 9093
    name: http-monitoring
  selector:
    istio: galley
---
# Source: istio/charts/gateways/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: istio-egressgateway
  namespace: istio-system
  annotations:
  labels:
    chart: gateways-1.0.0
    release: istio
    heritage: Tiller
    app: istio-egressgateway
    istio: egressgateway
spec:
  type: NodePort
  selector:
    app: istio-egressgateway
    istio: egressgateway
  ports:
  - name: http2
    port: 80
  - name: https
    port: 443
---
apiVersion: v1
kind: Service
metadata:
  name: istio-ingressgateway
  namespace: istio-system
  annotations:
  labels:
    chart: gateways-1.0.0
    release: istio
    heritage: Tiller
    app: istio-ingressgateway
    istio: ingressgateway
spec:
  type: NodePort
  selector:
    app: istio-ingressgateway
    istio: ingressgateway
  ports:
  - name: http2
    nodePort: 31380
    port: 80
    targetPort: 80
  - name: https
    nodePort: 31390
    port: 443
  - name: tcp
    nodePort: 31400
    port: 31400
  - name: tcp-pilot-grpc-tls
    port: 15011
    targetPort: 15011
  - name: tcp-citadel-grpc-tls
    port: 8060
    targetPort: 8060
  - name: http2-prometheus
    port: 15030
    targetPort: 15030
  - name: http2-grafana
    port: 15031
    targetPort: 15031
---
---
# Source: istio/charts/mixer/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: istio-policy
  namespace: istio-system
  labels:
    chart: mixer-1.0.0
    release: istio
    istio: mixer
spec:
  ports:
  - name: grpc-mixer
    port: 9091
  - name: grpc-mixer-mtls
    port: 15004
  - name: http-monitoring
    port: 9093
  selector:
    istio: mixer
    istio-mixer-type: policy
---
apiVersion: v1
kind: Service
metadata:
  name: istio-telemetry
  namespace: istio-system
  labels:
    chart: mixer-1.0.0
    release: istio
    istio: mixer
spec:
  ports:
  - name: grpc-mixer
    port: 9091
  - name: grpc-mixer-mtls
    port: 15004
  - name: http-monitoring
    port: 9093
  - name: prometheus
    port: 42422
  selector:
    istio: mixer
    istio-mixer-type: telemetry
---
---
# Source: istio/charts/mixer/templates/statsdtoprom.yaml
---
apiVersion: v1
kind: Service
metadata:
  name: istio-statsd-prom-bridge
  namespace: istio-system
  labels:
    chart: mixer-1.0.0
    release: istio
    istio: statsd-prom-bridge
spec:
  ports:
  - name: statsd-prom
    port: 9102
  - name: statsd-udp
    port: 9125
    protocol: UDP
  selector:
    istio: statsd-prom-bridge
---
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: istio-statsd-prom-bridge
  namespace: istio-system
  labels:
    chart: mixer-1.0.0
    release: istio
    istio: mixer
spec:
  template:
    metadata:
      labels:
        istio: statsd-prom-bridge
      annotations:
        sidecar.istio.io/inject: "false"
    spec:
      serviceAccountName: istio-mixer-service-account
      volumes:
      - name: config-volume
        configMap:
          name: istio-statsd-prom-bridge
      containers:
      - name: statsd-prom-bridge
        image: "docker.io/prom/statsd-exporter:v0.6.0"
        imagePullPolicy: IfNotPresent
        ports:
        - containerPort: 9102
        - containerPort: 9125
          protocol: UDP
        args:
        - '-statsd.mapping-config=/etc/statsd/mapping.conf'
        resources:
          requests:
            cpu: 10m
        volumeMounts:
        - name: config-volume
          mountPath: /etc/statsd
---
# Source: istio/charts/pilot/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: istio-pilot
  namespace: istio-system
  labels:
    app: istio-pilot
    chart: pilot-1.0.0
    release: istio
    heritage: Tiller
spec:
  ports:
  - port: 15010
    name: grpc-xds # direct
  - port: 15011
    name: https-xds # mTLS
  - port: 8080
    name: http-legacy-discovery # direct
  - port: 9093
    name: http-monitoring
  selector:
    istio: pilot
---
# Source: istio/charts/prometheus/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: istio-system
  annotations:
    prometheus.io/scrape: 'true'
  labels:
    name: prometheus
spec:
  selector:
    app: prometheus
  ports:
  - name: http-prometheus
    protocol: TCP
    port: 9090
---
# Source: istio/charts/security/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  # we use the normal name here (e.g. 'prometheus')
  # as grafana is configured to use this as a data source
  name: istio-citadel
  namespace: istio-system
  labels:
    app: istio-citadel
spec:
  ports:
  - name: grpc-citadel
    port: 8060
    targetPort: 8060
    protocol: TCP
  - name: http-monitoring
    port: 9093
  selector:
    istio: citadel
---
# Source: istio/charts/sidecarInjectorWebhook/templates/service.yaml
apiVersion: v1
kind: Service
metadata:
  name: istio-sidecar-injector
  namespace: istio-system
  labels:
    istio: sidecar-injector
spec:
  ports:
  - port: 443
  selector:
    istio: sidecar-injector
---
# Source: istio/charts/galley/templates/deployment.yaml
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: istio-galley
  namespace: istio-system
  labels:
    app: galley
    chart: galley-1.0.0
    release: istio
    heritage: Tiller
    istio: galley
spec:
  replicas: 1
  strategy:
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 0
  template:
    metadata:
      labels:
        istio: galley
      annotations:
        sidecar.istio.io/inject: "false"
        scheduler.alpha.kubernetes.io/critical-pod: ""
    spec:
      serviceAccountName: istio-galley-service-account
      containers:
      - name: validator
        image: "docker.io/istio/galley:1.0.0"
        imagePullPolicy: IfNotPresent
        ports:
        - containerPort: 443
        - containerPort: 9093
        command:
        - /usr/local/bin/galley
        - validator
        - --deployment-namespace=istio-system
        - --caCertFile=/etc/istio/certs/root-cert.pem
        - --tlsCertFile=/etc/istio/certs/cert-chain.pem
        - --tlsKeyFile=/etc/istio/certs/key.pem
        - --healthCheckInterval=2s
        - --healthCheckFile=/health
        - --webhook-config-file
        - /etc/istio/config/validatingwebhookconfiguration.yaml
        volumeMounts:
        - name: certs
          mountPath: /etc/istio/certs
          readOnly: true
        - name: config
          mountPath: /etc/istio/config
          readOnly: true
        livenessProbe:
          exec:
            command:
            - /usr/local/bin/galley
            - probe
            - --probe-path=/health
            - --interval=4s
          initialDelaySeconds: 4
          periodSeconds: 4
        readinessProbe:
          exec:
            command:
            - /usr/local/bin/galley
            - probe
            - --probe-path=/health
            - --interval=4s
          initialDelaySeconds: 4
          periodSeconds: 4
        resources:
          requests:
            cpu: 10m
      volumes:
      - name: certs
        secret:
          secretName: istio.istio-galley-service-account
      - name: config
        configMap:
          name: istio-galley-configuration
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: beta.kubernetes.io/arch
                operator: In
                values:
                - amd64
                - ppc64le
                - s390x
          preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 2
            preference:
              matchExpressions:
              - key: beta.kubernetes.io/arch
                operator: In
                values:
                - amd64
          - weight: 2
            preference:
              matchExpressions:
              - key: beta.kubernetes.io/arch
                operator: In
                values:
                - ppc64le
          - weight: 2
            preference:
              matchExpressions:
              - key: beta.kubernetes.io/arch
                operator: In
                values:
                - s390x
---
# Source: istio/charts/gateways/templates/deployment.yaml
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: istio-egressgateway
  namespace: istio-system
  labels:
    app: egressgateway
    chart: gateways-1.0.0
    release: istio
    heritage: Tiller
    app: istio-egressgateway
    istio: egressgateway
spec:
  replicas: 1
  template:
    metadata:
      labels:
        app: istio-egressgateway
        istio: egressgateway
      annotations:
        sidecar.istio.io/inject: "false"
        scheduler.alpha.kubernetes.io/critical-pod: ""
    spec:
      serviceAccountName: istio-egressgateway-service-account
      containers:
      - name: egressgateway
        image: "docker.io/istio/proxyv2:1.0.0"
        imagePullPolicy: IfNotPresent
        ports:
        - containerPort: 80
        - containerPort: 443
        args:
        - proxy
        - router
        - -v
        - "2"
        - --discoveryRefreshDelay
        - '1s' #discoveryRefreshDelay
        - --drainDuration
        - '45s' #drainDuration
        - --parentShutdownDuration
        - '1m0s' #parentShutdownDuration
        - --connectTimeout
        - '10s' #connectTimeout
        - --serviceCluster
        - istio-egressgateway
        - --zipkinAddress
        - zipkin:9411
        - --statsdUdpAddress
        - istio-statsd-prom-bridge:9125
        - --proxyAdminPort
        - "15000"
        - --controlPlaneAuthPolicy
        - NONE
        - --discoveryAddress
        - istio-pilot.istio-system:8080
        resources:
          requests:
            cpu: 10m
        env:
        - name: POD_NAME
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: metadata.name
        - name: POD_NAMESPACE
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: metadata.namespace
        - name: INSTANCE_IP
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: status.podIP
        - name: ISTIO_META_POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
        volumeMounts:
        - name: istio-certs
          mountPath: /etc/certs
          readOnly: true
        - name: egressgateway-certs
          mountPath: "/etc/istio/egressgateway-certs"
          readOnly: true
        - name: egressgateway-ca-certs
          mountPath: "/etc/istio/egressgateway-ca-certs"
          readOnly: true
      volumes:
      - name: istio-certs
        secret:
          secretName: istio.istio-egressgateway-service-account
          optional: true
      - name: egressgateway-certs
        secret:
          secretName: "istio-egressgateway-certs"
          optional: true
      - name: egressgateway-ca-certs
        secret:
          secretName: "istio-egressgateway-ca-certs"
          optional: true
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: beta.kubernetes.io/arch
                operator: In
                values:
                - amd64
                - ppc64le
                - s390x
          preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 2
            preference:
              matchExpressions:
              - key: beta.kubernetes.io/arch
                operator: In
                values:
                - amd64
          - weight: 2
            preference:
              matchExpressions:
              - key: beta.kubernetes.io/arch
                operator: In
                values:
                - ppc64le
          - weight: 2
            preference:
              matchExpressions:
              - key: beta.kubernetes.io/arch
                operator: In
                values:
                - s390x
---
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: istio-ingressgateway
  namespace: istio-system
  labels:
    app: ingressgateway
    chart: gateways-1.0.0
    release: istio
    heritage: Tiller
    app: istio-ingressgateway
    istio: ingressgateway
spec:
  replicas: 1
  template:
    metadata:
      labels:
        app: istio-ingressgateway
        istio: ingressgateway
      annotations:
        sidecar.istio.io/inject: "false"
        scheduler.alpha.kubernetes.io/critical-pod: ""
    spec:
      serviceAccountName: istio-ingressgateway-service-account
      containers:
      - name: ingressgateway
        image: "docker.io/istio/proxyv2:1.0.0"
        imagePullPolicy: IfNotPresent
        ports:
        - containerPort: 80
        - containerPort: 443
        - containerPort: 31400
        - containerPort: 15011
        - containerPort: 8060
        - containerPort: 15030
        - containerPort: 15031
        args:
        - proxy
        - router
        - -v
        - "2"
        - --discoveryRefreshDelay
        - '1s' #discoveryRefreshDelay
        - --drainDuration
        - '45s' #drainDuration
        - --parentShutdownDuration
        - '1m0s' #parentShutdownDuration
        - --connectTimeout
        - '10s' #connectTimeout
        - --serviceCluster
        - istio-ingressgateway
        - --zipkinAddress
        - zipkin:9411
        - --statsdUdpAddress
        - istio-statsd-prom-bridge:9125
        - --proxyAdminPort
        - "15000"
        - --controlPlaneAuthPolicy
        - NONE
        - --discoveryAddress
        - istio-pilot.istio-system:8080
        resources:
          requests:
            cpu: 10m
        env:
        - name: POD_NAME
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: metadata.name
        - name: POD_NAMESPACE
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: metadata.namespace
        - name: INSTANCE_IP
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: status.podIP
        - name: ISTIO_META_POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
        volumeMounts:
        - name: istio-certs
          mountPath: /etc/certs
          readOnly: true
        - name: ingressgateway-certs
          mountPath: "/etc/istio/ingressgateway-certs"
          readOnly: true
        - name: ingressgateway-ca-certs
          mountPath: "/etc/istio/ingressgateway-ca-certs"
          readOnly: true
      volumes:
      - name: istio-certs
        secret:
          secretName: istio.istio-ingressgateway-service-account
          optional: true
      - name: ingressgateway-certs
        secret:
          secretName: "istio-ingressgateway-certs"
          optional: true
      - name: ingressgateway-ca-certs
        secret:
          secretName: "istio-ingressgateway-ca-certs"
          optional: true
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: beta.kubernetes.io/arch
                operator: In
                values:
                - amd64
                - ppc64le
                - s390x
          preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 2
            preference:
              matchExpressions:
              - key: beta.kubernetes.io/arch
                operator: In
                values:
                - amd64
          - weight: 2
            preference:
              matchExpressions:
              - key: beta.kubernetes.io/arch
                operator: In
                values:
                - ppc64le
          - weight: 2
            preference:
              matchExpressions:
              - key: beta.kubernetes.io/arch
                operator: In
                values:
                - s390x
---
---
# Source: istio/charts/mixer/templates/deployment.yaml
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: istio-policy
  namespace: istio-system
  labels:
    chart: mixer-1.0.0
    release: istio
    istio: mixer
spec:
  replicas: 1
  template:
    metadata:
      labels:
        app: policy
        istio: mixer
        istio-mixer-type: policy
      annotations:
        sidecar.istio.io/inject: "false"
        scheduler.alpha.kubernetes.io/critical-pod: ""
    spec:
      serviceAccountName: istio-mixer-service-account
      volumes:
      - name: istio-certs
        secret:
          secretName: istio.istio-mixer-service-account
          optional: true
      - name: uds-socket
        emptyDir: {}
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: beta.kubernetes.io/arch
                operator: In
                values:
                - amd64
                - ppc64le
                - s390x
          preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 2
            preference:
              matchExpressions:
              - key: beta.kubernetes.io/arch
                operator: In
                values:
                - amd64
          - weight: 2
            preference:
              matchExpressions:
              - key: beta.kubernetes.io/arch
                operator: In
                values:
                - ppc64le
          - weight: 2
            preference:
              matchExpressions:
              - key: beta.kubernetes.io/arch
                operator: In
                values:
                - s390x
      containers:
      - name: mixer
        image: "docker.io/istio/mixer:1.0.0"
        imagePullPolicy: IfNotPresent
        ports:
        - containerPort: 9093
        - containerPort: 42422
        args:
        - --address
        - unix:///sock/mixer.socket
        - --configStoreURL=k8s://
        - --configDefaultNamespace=istio-system
        - --trace_zipkin_url=http://zipkin:9411/api/v1/spans
        resources:
          requests:
            cpu: 10m
        volumeMounts:
        - name: uds-socket
          mountPath: /sock
        livenessProbe:
          httpGet:
            path: /version
            port: 9093
          initialDelaySeconds: 5
          periodSeconds: 5
      - name: istio-proxy
        image: "docker.io/istio/proxyv2:1.0.0"
        imagePullPolicy: IfNotPresent
        ports:
        - containerPort: 9091
        - containerPort: 15004
        args:
        - proxy
        - --serviceCluster
        - istio-policy
        - --templateFile
        - /etc/istio/proxy/envoy_policy.yaml.tmpl
        - --controlPlaneAuthPolicy
        - NONE
        env:
        - name: POD_NAME
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: metadata.name
        - name: POD_NAMESPACE
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: metadata.namespace
        - name: INSTANCE_IP
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: status.podIP
        resources:
          requests:
            cpu: 10m
        volumeMounts:
        - name: istio-certs
          mountPath: /etc/certs
          readOnly: true
        - name: uds-socket
          mountPath: /sock
---
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: istio-telemetry
  namespace: istio-system
  labels:
    chart: mixer-1.0.0
    release: istio
    istio: mixer
spec:
  replicas: 1
  template:
    metadata:
      labels:
        app: telemetry
        istio: mixer
        istio-mixer-type: telemetry
      annotations:
        sidecar.istio.io/inject: "false"
        scheduler.alpha.kubernetes.io/critical-pod: ""
    spec:
      serviceAccountName: istio-mixer-service-account
      volumes:
      - name: istio-certs
        secret:
          secretName: istio.istio-mixer-service-account
          optional: true
      - name: uds-socket
        emptyDir: {}
      containers:
      - name: mixer
        image: "docker.io/istio/mixer:1.0.0"
        imagePullPolicy: IfNotPresent
        ports:
        - containerPort: 9093
        - containerPort: 42422
        args:
        - --address
        - unix:///sock/mixer.socket
        - --configStoreURL=k8s://
        - --configDefaultNamespace=istio-system
        - --trace_zipkin_url=http://zipkin:9411/api/v1/spans
        resources:
          requests:
            cpu: 10m
        volumeMounts:
        - name: uds-socket
          mountPath: /sock
        livenessProbe:
          httpGet:
            path: /version
            port: 9093
          initialDelaySeconds: 5
          periodSeconds: 5
      - name: istio-proxy
        image: "docker.io/istio/proxyv2:1.0.0"
        imagePullPolicy: IfNotPresent
        ports:
        - containerPort: 9091
        - containerPort: 15004
        args:
        - proxy
        - --serviceCluster
        - istio-telemetry
        - --templateFile
        - /etc/istio/proxy/envoy_telemetry.yaml.tmpl
        - --controlPlaneAuthPolicy
        - NONE
        env:
        - name: POD_NAME
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: metadata.name
        - name: POD_NAMESPACE
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: metadata.namespace
        - name: INSTANCE_IP
          valueFrom:
            fieldRef:
              apiVersion: v1
              fieldPath: status.podIP
        resources:
          requests:
            cpu: 10m
        volumeMounts:
        - name: istio-certs
          mountPath: /etc/certs
          readOnly: true
        - name: uds-socket
          mountPath: /sock
---
---
# Source: istio/charts/pilot/templates/deployment.yaml
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: istio-pilot
  namespace: istio-system
  # TODO: default template doesn't have this, which one is right ?
labels: app: istio-pilot chart: pilot-1.0.0 release: istio heritage: Tiller istio: pilot annotations: checksum/config-volume: f8da08b6b8c170dde721efd680270b2901e750d4aa186ebb6c22bef5b78a43f9 spec: replicas: 1 template: metadata: labels: istio: pilot app: pilot annotations: sidecar.istio.io/inject: "false" scheduler.alpha.kubernetes.io/critical-pod: "" spec: serviceAccountName: istio-pilot-service-account containers: - name: discovery image: "docker.io/istio/pilot:1.0.0" imagePullPolicy: IfNotPresent args: - "discovery" ports: - containerPort: 8080 - containerPort: 15010 readinessProbe: httpGet: path: /debug/endpointz port: 8080 initialDelaySeconds: 30 periodSeconds: 30 timeoutSeconds: 5 env: - name: POD_NAME valueFrom: fieldRef: apiVersion: v1 fieldPath: metadata.name - name: POD_NAMESPACE valueFrom: fieldRef: apiVersion: v1 fieldPath: metadata.namespace - name: PILOT_THROTTLE value: "500" - name: PILOT_CACHE_SQUASH value: "5" - name: PILOT_TRACE_SAMPLING value: "100" resources: requests: cpu: 500m memory: 2048Mi volumeMounts: - name: config-volume mountPath: /etc/istio/config - name: istio-certs mountPath: /etc/certs readOnly: true - name: istio-proxy image: "docker.io/istio/proxyv2:1.0.0" imagePullPolicy: IfNotPresent ports: - containerPort: 15003 - containerPort: 15005 - containerPort: 15007 - containerPort: 15011 args: - proxy - --serviceCluster - istio-pilot - --templateFile - /etc/istio/proxy/envoy_pilot.yaml.tmpl - --controlPlaneAuthPolicy - NONE env: - name: POD_NAME valueFrom: fieldRef: apiVersion: v1 fieldPath: metadata.name - name: POD_NAMESPACE valueFrom: fieldRef: apiVersion: v1 fieldPath: metadata.namespace - name: INSTANCE_IP valueFrom: fieldRef: apiVersion: v1 fieldPath: status.podIP resources: requests: cpu: 10m volumeMounts: - name: istio-certs mountPath: /etc/certs readOnly: true volumes: - name: config-volume configMap: name: istio - name: istio-certs secret: secretName: istio.istio-pilot-service-account affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - amd64 - ppc64le - s390x preferredDuringSchedulingIgnoredDuringExecution: - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - amd64 - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - ppc64le - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - s390x --- # Source: istio/charts/prometheus/templates/deployment.yaml # TODO: the original template has service account, roles, etc apiVersion: extensions/v1beta1 kind: Deployment metadata: name: prometheus namespace: istio-system labels: app: prometheus chart: prometheus-0.1.0 release: istio heritage: Tiller spec: replicas: 1 selector: matchLabels: app: prometheus template: metadata: labels: app: prometheus annotations: sidecar.istio.io/inject: "false" scheduler.alpha.kubernetes.io/critical-pod: "" spec: serviceAccountName: prometheus containers: - name: prometheus image: "docker.io/prom/prometheus:v2.3.1" imagePullPolicy: IfNotPresent args: - '--storage.tsdb.retention=6h' - '--config.file=/etc/prometheus/prometheus.yml' ports: - containerPort: 9090 name: http livenessProbe: httpGet: path: /-/healthy port: 9090 readinessProbe: httpGet: path: /-/ready port: 9090 resources: requests: cpu: 10m volumeMounts: - name: config-volume mountPath: /etc/prometheus volumes: - name: config-volume configMap: name: prometheus affinity: nodeAffinity: 
requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - amd64 - ppc64le - s390x preferredDuringSchedulingIgnoredDuringExecution: - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - amd64 - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - ppc64le - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - s390x --- # Source: istio/charts/security/templates/deployment.yaml # istio CA watching all namespaces apiVersion: extensions/v1beta1 kind: Deployment metadata: name: istio-citadel namespace: istio-system labels: app: security chart: security-1.0.0 release: istio heritage: Tiller istio: citadel spec: replicas: 1 template: metadata: labels: istio: citadel annotations: sidecar.istio.io/inject: "false" scheduler.alpha.kubernetes.io/critical-pod: "" spec: serviceAccountName: istio-citadel-service-account containers: - name: citadel image: "docker.io/istio/citadel:1.0.0" imagePullPolicy: IfNotPresent args: - --append-dns-names=true - --grpc-port=8060 - --grpc-hostname=citadel - --citadel-storage-namespace=istio-system - --self-signed-ca=true resources: requests: cpu: 10m affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - amd64 - ppc64le - s390x preferredDuringSchedulingIgnoredDuringExecution: - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - amd64 - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - ppc64le - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - s390x --- # Source: istio/charts/sidecarInjectorWebhook/templates/deployment.yaml apiVersion: extensions/v1beta1 kind: Deployment metadata: name: istio-sidecar-injector namespace: istio-system labels: app: sidecarInjectorWebhook chart: sidecarInjectorWebhook-1.0.0 release: istio heritage: Tiller istio: sidecar-injector spec: replicas: 1 template: metadata: labels: istio: sidecar-injector annotations: sidecar.istio.io/inject: "false" scheduler.alpha.kubernetes.io/critical-pod: "" spec: serviceAccountName: istio-sidecar-injector-service-account containers: - name: sidecar-injector-webhook image: "docker.io/istio/sidecar_injector:1.0.0" imagePullPolicy: IfNotPresent args: - --caCertFile=/etc/istio/certs/root-cert.pem - --tlsCertFile=/etc/istio/certs/cert-chain.pem - --tlsKeyFile=/etc/istio/certs/key.pem - --injectConfig=/etc/istio/inject/config - --meshConfig=/etc/istio/config/mesh - --healthCheckInterval=2s - --healthCheckFile=/health volumeMounts: - name: config-volume mountPath: /etc/istio/config readOnly: true - name: certs mountPath: /etc/istio/certs readOnly: true - name: inject-config mountPath: /etc/istio/inject readOnly: true livenessProbe: exec: command: - /usr/local/bin/sidecar-injector - probe - --probe-path=/health - --interval=4s initialDelaySeconds: 4 periodSeconds: 4 readinessProbe: exec: command: - /usr/local/bin/sidecar-injector - probe - --probe-path=/health - --interval=4s initialDelaySeconds: 4 periodSeconds: 4 resources: requests: cpu: 10m volumes: - name: config-volume configMap: name: istio - name: certs secret: secretName: istio.istio-sidecar-injector-service-account - name: inject-config configMap: name: istio-sidecar-injector items: - key: config path: config affinity: 
nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - amd64 - ppc64le - s390x preferredDuringSchedulingIgnoredDuringExecution: - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - amd64 - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - ppc64le - weight: 2 preference: matchExpressions: - key: beta.kubernetes.io/arch operator: In values: - s390x --- # Source: istio/charts/pilot/templates/gateway.yaml apiVersion: networking.istio.io/v1alpha3 kind: Gateway metadata: name: istio-autogenerated-k8s-ingress namespace: istio-system spec: selector: istio: ingress servers: - port: number: 80 protocol: HTTP2 name: http hosts: - "*" --- --- # Source: istio/charts/gateways/templates/autoscale.yaml apiVersion: autoscaling/v2beta1 kind: HorizontalPodAutoscaler metadata: name: istio-egressgateway namespace: istio-system spec: maxReplicas: 5 minReplicas: 1 scaleTargetRef: apiVersion: apps/v1beta1 kind: Deployment name: istio-egressgateway metrics: - type: Resource resource: name: cpu targetAverageUtilization: 60 --- apiVersion: autoscaling/v2beta1 kind: HorizontalPodAutoscaler metadata: name: istio-ingressgateway namespace: istio-system spec: maxReplicas: 5 minReplicas: 1 scaleTargetRef: apiVersion: apps/v1beta1 kind: Deployment name: istio-ingressgateway metrics: - type: Resource resource: name: cpu targetAverageUtilization: 60 --- --- # Source: istio/charts/mixer/templates/autoscale.yaml apiVersion: autoscaling/v2beta1 kind: HorizontalPodAutoscaler metadata: name: istio-policy namespace: istio-system spec: maxReplicas: 5 minReplicas: 1 scaleTargetRef: apiVersion: apps/v1beta1 kind: Deployment name: istio-policy metrics: - type: Resource resource: name: cpu targetAverageUtilization: 80 --- apiVersion: autoscaling/v2beta1 kind: HorizontalPodAutoscaler metadata: name: istio-telemetry namespace: istio-system spec: maxReplicas: 5 minReplicas: 1 scaleTargetRef: apiVersion: apps/v1beta1 kind: Deployment name: istio-telemetry metrics: - type: Resource resource: name: cpu targetAverageUtilization: 80 --- --- # Source: istio/charts/pilot/templates/autoscale.yaml apiVersion: autoscaling/v2beta1 kind: HorizontalPodAutoscaler metadata: name: istio-pilot spec: maxReplicas: 1 minReplicas: 1 scaleTargetRef: apiVersion: apps/v1beta1 kind: Deployment name: istio-pilot metrics: - type: Resource resource: name: cpu targetAverageUtilization: 55 --- --- # Source: istio/charts/sidecarInjectorWebhook/templates/mutatingwebhook.yaml apiVersion: admissionregistration.k8s.io/v1beta1 kind: MutatingWebhookConfiguration metadata: name: istio-sidecar-injector namespace: istio-system labels: app: istio-sidecar-injector chart: sidecarInjectorWebhook-1.0.0 release: istio heritage: Tiller webhooks: - name: sidecar-injector.istio.io clientConfig: service: name: istio-sidecar-injector namespace: istio-system path: "/inject" caBundle: "" rules: - operations: [ "CREATE" ] apiGroups: [""] apiVersions: ["v1"] resources: ["pods"] failurePolicy: Fail namespaceSelector: matchLabels: istio-injection: enabled --- # Source: istio/charts/galley/templates/validatingwehookconfiguration.yaml.tpl --- # Source: istio/charts/pilot/templates/meshexpansion.yaml --- # Source: istio/charts/security/templates/create-custom-resources-job.yaml --- # Source: istio/charts/security/templates/enable-mesh-mtls.yaml --- # Source: istio/charts/security/templates/meshexpansion.yaml --- 
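# Note: the MutatingWebhookConfiguration above injects sidecars only into
# namespaces carrying the `istio-injection=enabled` label; for example, to
# opt in the default namespace:
#
#   kubectl label namespace default istio-injection=enabled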
--- # Source: istio/charts/telemetry-gateway/templates/gateway.yaml --- # Source: istio/templates/install-custom-resources.sh.tpl --- # Source: istio/charts/mixer/templates/config.yaml apiVersion: "config.istio.io/v1alpha2" kind: attributemanifest metadata: name: istioproxy namespace: istio-system spec: attributes: origin.ip: valueType: IP_ADDRESS origin.uid: valueType: STRING origin.user: valueType: STRING request.headers: valueType: STRING_MAP request.id: valueType: STRING request.host: valueType: STRING request.method: valueType: STRING request.path: valueType: STRING request.reason: valueType: STRING request.referer: valueType: STRING request.scheme: valueType: STRING request.total_size: valueType: INT64 request.size: valueType: INT64 request.time: valueType: TIMESTAMP request.useragent: valueType: STRING response.code: valueType: INT64 response.duration: valueType: DURATION response.headers: valueType: STRING_MAP response.total_size: valueType: INT64 response.size: valueType: INT64 response.time: valueType: TIMESTAMP source.uid: valueType: STRING source.user: # DEPRECATED valueType: STRING source.principal: valueType: STRING destination.uid: valueType: STRING destination.principal: valueType: STRING destination.port: valueType: INT64 connection.event: valueType: STRING connection.id: valueType: STRING connection.received.bytes: valueType: INT64 connection.received.bytes_total: valueType: INT64 connection.sent.bytes: valueType: INT64 connection.sent.bytes_total: valueType: INT64 connection.duration: valueType: DURATION connection.mtls: valueType: BOOL context.protocol: valueType: STRING context.timestamp: valueType: TIMESTAMP context.time: valueType: TIMESTAMP # Deprecated, kept for compatibility context.reporter.local: valueType: BOOL context.reporter.kind: valueType: STRING context.reporter.uid: valueType: STRING api.service: valueType: STRING api.version: valueType: STRING api.operation: valueType: STRING api.protocol: valueType: STRING request.auth.principal: valueType: STRING request.auth.audiences: valueType: STRING request.auth.presenter: valueType: STRING request.auth.claims: valueType: STRING_MAP request.auth.raw_claims: valueType: STRING request.api_key: valueType: STRING --- apiVersion: "config.istio.io/v1alpha2" kind: attributemanifest metadata: name: kubernetes namespace: istio-system spec: attributes: source.ip: valueType: IP_ADDRESS source.labels: valueType: STRING_MAP source.metadata: valueType: STRING_MAP source.name: valueType: STRING source.namespace: valueType: STRING source.owner: valueType: STRING source.service: # DEPRECATED valueType: STRING source.serviceAccount: valueType: STRING source.services: valueType: STRING source.workload.uid: valueType: STRING source.workload.name: valueType: STRING source.workload.namespace: valueType: STRING destination.ip: valueType: IP_ADDRESS destination.labels: valueType: STRING_MAP destination.metadata: valueType: STRING_MAP destination.owner: valueType: STRING destination.name: valueType: STRING destination.container.name: valueType: STRING destination.namespace: valueType: STRING destination.service: # DEPRECATED valueType: STRING destination.service.uid: valueType: STRING destination.service.name: valueType: STRING destination.service.namespace: valueType: STRING destination.service.host: valueType: STRING destination.serviceAccount: valueType: STRING destination.workload.uid: valueType: STRING destination.workload.name: valueType: STRING destination.workload.namespace: valueType: STRING --- apiVersion: 
"config.istio.io/v1alpha2" kind: stdio metadata: name: handler namespace: istio-system spec: outputAsJson: true --- apiVersion: "config.istio.io/v1alpha2" kind: logentry metadata: name: accesslog namespace: istio-system spec: severity: '"Info"' timestamp: request.time variables: sourceIp: source.ip | ip("0.0.0.0") sourceApp: source.labels["app"] | "" sourcePrincipal: source.principal | "" sourceName: source.name | "" sourceWorkload: source.workload.name | "" sourceNamespace: source.namespace | "" sourceOwner: source.owner | "" destinationApp: destination.labels["app"] | "" destinationIp: destination.ip | ip("0.0.0.0") destinationServiceHost: destination.service.host | "" destinationWorkload: destination.workload.name | "" destinationName: destination.name | "" destinationNamespace: destination.namespace | "" destinationOwner: destination.owner | "" destinationPrincipal: destination.principal | "" apiClaims: request.auth.raw_claims | "" apiKey: request.api_key | request.headers["x-api-key"] | "" protocol: request.scheme | context.protocol | "http" method: request.method | "" url: request.path | "" responseCode: response.code | 0 responseSize: response.size | 0 requestSize: request.size | 0 requestId: request.headers["x-request-id"] | "" clientTraceId: request.headers["x-client-trace-id"] | "" latency: response.duration | "0ms" connection_security_policy: conditional((context.reporter.kind | "inbound") == "outbound", "unknown", conditional(connection.mtls | false, "mutual_tls", "none")) userAgent: request.useragent | "" responseTimestamp: response.time receivedBytes: request.total_size | 0 sentBytes: response.total_size | 0 referer: request.referer | "" httpAuthority: request.headers[":authority"] | request.host | "" xForwardedFor: request.headers["x-forwarded-for"] | "0.0.0.0" reporter: conditional((context.reporter.kind | "inbound") == "outbound", "source", "destination") monitored_resource_type: '"global"' --- apiVersion: "config.istio.io/v1alpha2" kind: logentry metadata: name: tcpaccesslog namespace: istio-system spec: severity: '"Info"' timestamp: context.time | timestamp("2017-01-01T00:00:00Z") variables: connectionEvent: connection.event | "" sourceIp: source.ip | ip("0.0.0.0") sourceApp: source.labels["app"] | "" sourcePrincipal: source.principal | "" sourceName: source.name | "" sourceWorkload: source.workload.name | "" sourceNamespace: source.namespace | "" sourceOwner: source.owner | "" destinationApp: destination.labels["app"] | "" destinationIp: destination.ip | ip("0.0.0.0") destinationServiceHost: destination.service.host | "" destinationWorkload: destination.workload.name | "" destinationName: destination.name | "" destinationNamespace: destination.namespace | "" destinationOwner: destination.owner | "" destinationPrincipal: destination.principal | "" protocol: context.protocol | "tcp" connectionDuration: connection.duration | "0ms" connection_security_policy: conditional((context.reporter.kind | "inbound") == "outbound", "unknown", conditional(connection.mtls | false, "mutual_tls", "none")) receivedBytes: connection.received.bytes | 0 sentBytes: connection.sent.bytes | 0 totalReceivedBytes: connection.received.bytes_total | 0 totalSentBytes: connection.sent.bytes_total | 0 reporter: conditional((context.reporter.kind | "inbound") == "outbound", "source", "destination") monitored_resource_type: '"global"' --- apiVersion: "config.istio.io/v1alpha2" kind: rule metadata: name: stdio namespace: istio-system spec: match: context.protocol == "http" || context.protocol == "grpc" 
actions: - handler: handler.stdio instances: - accesslog.logentry --- apiVersion: "config.istio.io/v1alpha2" kind: rule metadata: name: stdiotcp namespace: istio-system spec: match: context.protocol == "tcp" actions: - handler: handler.stdio instances: - tcpaccesslog.logentry --- apiVersion: "config.istio.io/v1alpha2" kind: metric metadata: name: requestcount namespace: istio-system spec: value: "1" dimensions: reporter: conditional((context.reporter.kind | "inbound") == "outbound", "source", "destination") source_workload: source.workload.name | "unknown" source_workload_namespace: source.workload.namespace | "unknown" source_principal: source.principal | "unknown" source_app: source.labels["app"] | "unknown" source_version: source.labels["version"] | "unknown" destination_workload: destination.workload.name | "unknown" destination_workload_namespace: destination.workload.namespace | "unknown" destination_principal: destination.principal | "unknown" destination_app: destination.labels["app"] | "unknown" destination_version: destination.labels["version"] | "unknown" destination_service: destination.service.host | "unknown" destination_service_name: destination.service.name | "unknown" destination_service_namespace: destination.service.namespace | "unknown" request_protocol: api.protocol | context.protocol | "unknown" response_code: response.code | 200 connection_security_policy: conditional((context.reporter.kind | "inbound") == "outbound", "unknown", conditional(connection.mtls | false, "mutual_tls", "none")) monitored_resource_type: '"UNSPECIFIED"' --- apiVersion: "config.istio.io/v1alpha2" kind: metric metadata: name: requestduration namespace: istio-system spec: value: response.duration | "0ms" dimensions: reporter: conditional((context.reporter.kind | "inbound") == "outbound", "source", "destination") source_workload: source.workload.name | "unknown" source_workload_namespace: source.workload.namespace | "unknown" source_principal: source.principal | "unknown" source_app: source.labels["app"] | "unknown" source_version: source.labels["version"] | "unknown" destination_workload: destination.workload.name | "unknown" destination_workload_namespace: destination.workload.namespace | "unknown" destination_principal: destination.principal | "unknown" destination_app: destination.labels["app"] | "unknown" destination_version: destination.labels["version"] | "unknown" destination_service: destination.service.host | "unknown" destination_service_name: destination.service.name | "unknown" destination_service_namespace: destination.service.namespace | "unknown" request_protocol: api.protocol | context.protocol | "unknown" response_code: response.code | 200 connection_security_policy: conditional((context.reporter.kind | "inbound") == "outbound", "unknown", conditional(connection.mtls | false, "mutual_tls", "none")) monitored_resource_type: '"UNSPECIFIED"' --- apiVersion: "config.istio.io/v1alpha2" kind: metric metadata: name: requestsize namespace: istio-system spec: value: request.size | 0 dimensions: reporter: conditional((context.reporter.kind | "inbound") == "outbound", "source", "destination") source_workload: source.workload.name | "unknown" source_workload_namespace: source.workload.namespace | "unknown" source_principal: source.principal | "unknown" source_app: source.labels["app"] | "unknown" source_version: source.labels["version"] | "unknown" destination_workload: destination.workload.name | "unknown" destination_workload_namespace: destination.workload.namespace | "unknown" 
destination_principal: destination.principal | "unknown" destination_app: destination.labels["app"] | "unknown" destination_version: destination.labels["version"] | "unknown" destination_service: destination.service.host | "unknown" destination_service_name: destination.service.name | "unknown" destination_service_namespace: destination.service.namespace | "unknown" request_protocol: api.protocol | context.protocol | "unknown" response_code: response.code | 200 connection_security_policy: conditional((context.reporter.kind | "inbound") == "outbound", "unknown", conditional(connection.mtls | false, "mutual_tls", "none")) monitored_resource_type: '"UNSPECIFIED"' --- apiVersion: "config.istio.io/v1alpha2" kind: metric metadata: name: responsesize namespace: istio-system spec: value: response.size | 0 dimensions: reporter: conditional((context.reporter.kind | "inbound") == "outbound", "source", "destination") source_workload: source.workload.name | "unknown" source_workload_namespace: source.workload.namespace | "unknown" source_principal: source.principal | "unknown" source_app: source.labels["app"] | "unknown" source_version: source.labels["version"] | "unknown" destination_workload: destination.workload.name | "unknown" destination_workload_namespace: destination.workload.namespace | "unknown" destination_principal: destination.principal | "unknown" destination_app: destination.labels["app"] | "unknown" destination_version: destination.labels["version"] | "unknown" destination_service: destination.service.host | "unknown" destination_service_name: destination.service.name | "unknown" destination_service_namespace: destination.service.namespace | "unknown" request_protocol: api.protocol | context.protocol | "unknown" response_code: response.code | 200 connection_security_policy: conditional((context.reporter.kind | "inbound") == "outbound", "unknown", conditional(connection.mtls | false, "mutual_tls", "none")) monitored_resource_type: '"UNSPECIFIED"' --- apiVersion: "config.istio.io/v1alpha2" kind: metric metadata: name: tcpbytesent namespace: istio-system spec: value: connection.sent.bytes | 0 dimensions: reporter: conditional((context.reporter.kind | "inbound") == "outbound", "source", "destination") source_workload: source.workload.name | "unknown" source_workload_namespace: source.workload.namespace | "unknown" source_principal: source.principal | "unknown" source_app: source.labels["app"] | "unknown" source_version: source.labels["version"] | "unknown" destination_workload: destination.workload.name | "unknown" destination_workload_namespace: destination.workload.namespace | "unknown" destination_principal: destination.principal | "unknown" destination_app: destination.labels["app"] | "unknown" destination_version: destination.labels["version"] | "unknown" destination_service: destination.service.name | "unknown" destination_service_name: destination.service.name | "unknown" destination_service_namespace: destination.service.namespace | "unknown" connection_security_policy: conditional((context.reporter.kind | "inbound") == "outbound", "unknown", conditional(connection.mtls | false, "mutual_tls", "none")) monitored_resource_type: '"UNSPECIFIED"' --- apiVersion: "config.istio.io/v1alpha2" kind: metric metadata: name: tcpbytereceived namespace: istio-system spec: value: connection.received.bytes | 0 dimensions: reporter: conditional((context.reporter.kind | "inbound") == "outbound", "source", "destination") source_workload: source.workload.name | "unknown" source_workload_namespace: 
source.workload.namespace | "unknown" source_principal: source.principal | "unknown" source_app: source.labels["app"] | "unknown" source_version: source.labels["version"] | "unknown" destination_workload: destination.workload.name | "unknown" destination_workload_namespace: destination.workload.namespace | "unknown" destination_principal: destination.principal | "unknown" destination_app: destination.labels["app"] | "unknown" destination_version: destination.labels["version"] | "unknown" destination_service: destination.service.name | "unknown" destination_service_name: destination.service.name | "unknown" destination_service_namespace: destination.service.namespace | "unknown" connection_security_policy: conditional((context.reporter.kind | "inbound") == "outbound", "unknown", conditional(connection.mtls | false, "mutual_tls", "none")) monitored_resource_type: '"UNSPECIFIED"' --- apiVersion: "config.istio.io/v1alpha2" kind: prometheus metadata: name: handler namespace: istio-system spec: metrics: - name: requests_total instance_name: requestcount.metric.istio-system kind: COUNTER label_names: - reporter - source_app - source_principal - source_workload - source_workload_namespace - source_version - destination_app - destination_principal - destination_workload - destination_workload_namespace - destination_version - destination_service - destination_service_name - destination_service_namespace - request_protocol - response_code - connection_security_policy - name: request_duration_seconds instance_name: requestduration.metric.istio-system kind: DISTRIBUTION label_names: - reporter - source_app - source_principal - source_workload - source_workload_namespace - source_version - destination_app - destination_principal - destination_workload - destination_workload_namespace - destination_version - destination_service - destination_service_name - destination_service_namespace - request_protocol - response_code - connection_security_policy buckets: explicit_buckets: bounds: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10] - name: request_bytes instance_name: requestsize.metric.istio-system kind: DISTRIBUTION label_names: - reporter - source_app - source_principal - source_workload - source_workload_namespace - source_version - destination_app - destination_principal - destination_workload - destination_workload_namespace - destination_version - destination_service - destination_service_name - destination_service_namespace - request_protocol - response_code - connection_security_policy buckets: exponentialBuckets: numFiniteBuckets: 8 scale: 1 growthFactor: 10 - name: response_bytes instance_name: responsesize.metric.istio-system kind: DISTRIBUTION label_names: - reporter - source_app - source_principal - source_workload - source_workload_namespace - source_version - destination_app - destination_principal - destination_workload - destination_workload_namespace - destination_version - destination_service - destination_service_name - destination_service_namespace - request_protocol - response_code - connection_security_policy buckets: exponentialBuckets: numFiniteBuckets: 8 scale: 1 growthFactor: 10 - name: tcp_sent_bytes_total instance_name: tcpbytesent.metric.istio-system kind: COUNTER label_names: - reporter - source_app - source_principal - source_workload - source_workload_namespace - source_version - destination_app - destination_principal - destination_workload - destination_workload_namespace - destination_version - destination_service - destination_service_name - 
destination_service_namespace - connection_security_policy - name: tcp_received_bytes_total instance_name: tcpbytereceived.metric.istio-system kind: COUNTER label_names: - reporter - source_app - source_principal - source_workload - source_workload_namespace - source_version - destination_app - destination_principal - destination_workload - destination_workload_namespace - destination_version - destination_service - destination_service_name - destination_service_namespace - connection_security_policy --- apiVersion: "config.istio.io/v1alpha2" kind: rule metadata: name: promhttp namespace: istio-system spec: match: context.protocol == "http" || context.protocol == "grpc" actions: - handler: handler.prometheus instances: - requestcount.metric - requestduration.metric - requestsize.metric - responsesize.metric --- apiVersion: "config.istio.io/v1alpha2" kind: rule metadata: name: promtcp namespace: istio-system spec: match: context.protocol == "tcp" actions: - handler: handler.prometheus instances: - tcpbytesent.metric - tcpbytereceived.metric --- apiVersion: "config.istio.io/v1alpha2" kind: kubernetesenv metadata: name: handler namespace: istio-system spec: # when running from mixer root, use the following config after adding a # symbolic link to a kubernetes config file via: # # $ ln -s ~/.kube/config mixer/adapter/kubernetes/kubeconfig # # kubeconfig_path: "mixer/adapter/kubernetes/kubeconfig" --- apiVersion: "config.istio.io/v1alpha2" kind: rule metadata: name: kubeattrgenrulerule namespace: istio-system spec: actions: - handler: handler.kubernetesenv instances: - attributes.kubernetes --- apiVersion: "config.istio.io/v1alpha2" kind: rule metadata: name: tcpkubeattrgenrulerule namespace: istio-system spec: match: context.protocol == "tcp" actions: - handler: handler.kubernetesenv instances: - attributes.kubernetes --- apiVersion: "config.istio.io/v1alpha2" kind: kubernetes metadata: name: attributes namespace: istio-system spec: # Pass the required attribute data to the adapter source_uid: source.uid | "" source_ip: source.ip | ip("0.0.0.0") # default to unspecified ip addr destination_uid: destination.uid | "" destination_port: destination.port | 0 attribute_bindings: # Fill the new attributes from the adapter produced output. 
    # $out refers to an instance of OutputTemplate message
    source.ip: $out.source_pod_ip | ip("0.0.0.0")
    source.uid: $out.source_pod_uid | "unknown"
    source.labels: $out.source_labels | emptyStringMap()
    source.name: $out.source_pod_name | "unknown"
    source.namespace: $out.source_namespace | "default"
    source.owner: $out.source_owner | "unknown"
    source.serviceAccount: $out.source_service_account_name | "unknown"
    source.workload.uid: $out.source_workload_uid | "unknown"
    source.workload.name: $out.source_workload_name | "unknown"
    source.workload.namespace: $out.source_workload_namespace | "unknown"
    destination.ip: $out.destination_pod_ip | ip("0.0.0.0")
    destination.uid: $out.destination_pod_uid | "unknown"
    destination.labels: $out.destination_labels | emptyStringMap()
    destination.name: $out.destination_pod_name | "unknown"
    destination.container.name: $out.destination_container_name | "unknown"
    destination.namespace: $out.destination_namespace | "default"
    destination.owner: $out.destination_owner | "unknown"
    destination.serviceAccount: $out.destination_service_account_name | "unknown"
    destination.workload.uid: $out.destination_workload_uid | "unknown"
    destination.workload.name: $out.destination_workload_name | "unknown"
    destination.workload.namespace: $out.destination_workload_namespace | "unknown"
---
# Configuration needed by Mixer.
# Mixer cluster is delivered via CDS
# Specify mixer cluster settings
apiVersion: networking.istio.io/v1alpha3
kind: DestinationRule
metadata:
  name: istio-policy
  namespace: istio-system
spec:
  host: istio-policy.istio-system.svc.cluster.local
  trafficPolicy:
    connectionPool:
      http:
        http2MaxRequests: 10000
        maxRequestsPerConnection: 10000
---
apiVersion: networking.istio.io/v1alpha3
kind: DestinationRule
metadata:
  name: istio-telemetry
  namespace: istio-system
spec:
  host: istio-telemetry.istio-system.svc.cluster.local
  trafficPolicy:
    connectionPool:
      http:
        http2MaxRequests: 10000
        maxRequestsPerConnection: 10000
---

================================================
FILE: examples/90_Kubernetes/minikube/README.md
================================================
# Development with Minikube

## Install `minikube`, `kubectl`, and `helm`

This only needs to be done once, or periodically if you wish to upgrade.

```
sudo apt update && sudo apt install -y --no-install-recommends socat
curl -Lo minikube https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64 && chmod +x minikube && sudo mv minikube /usr/local/bin/
curl -Lo kubectl https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl && chmod +x kubectl && sudo mv kubectl /usr/local/bin
curl https://raw.githubusercontent.com/kubernetes/helm/master/scripts/get > get_helm.sh
chmod 700 get_helm.sh
./get_helm.sh
```

Add the `coreos/prometheus-operator` repo:

```
helm repo add coreos https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/
```

## Launch a Kubernetes Cluster

```
./bootstrap.sh
```

Check configurations:

```
kubectl get all
kubectl get all --all-namespaces
kubectl get nodes -o=custom-columns=NAME:.metadata.name,GPUs:.status.capacity.'nvidia\.com/gpu'
# last command should report the number of GPUs on your system
# this may take some time - coffee?!
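# optional sanity check (assumption: the device plugin pod created by
# bootstrap.sh has "nvidia-device-plugin" in its name):
kubectl get pods -n kube-system | grep nvidia-device-plugin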
``` ================================================ FILE: examples/90_Kubernetes/minikube/bootstrap.sh ================================================ #!/bin/bash mkdir -p $HOME/.kube touch $HOME/.kube/config export MINIKUBE_HOME=$HOME export CHANGE_MINIKUBE_NONE_USER=true export KUBECONFIG=$HOME/.kube/config version=v1.10 sudo minikube start \ --feature-gates=DevicePlugins=true \ --vm-driver=none \ --kubernetes-version=${version}.0 \ --bootstrapper=kubeadm \ --extra-config=kubelet.authentication-token-webhook=true \ --extra-config=kubelet.authorization-mode=Webhook \ --extra-config=scheduler.address=0.0.0.0 \ --extra-config=controller-manager.address=0.0.0.0 \ --extra-config=controller-manager.cluster-signing-cert-file="/var/lib/localkube/certs/ca.crt" \ --extra-config=controller-manager.cluster-signing-key-file="/var/lib/localkube/certs/ca.key" \ --extra-config=apiserver.admission-control="NamespaceLifecycle,LimitRanger,ServiceAccount,PersistentVolumeLabel,DefaultStorageClass,DefaultTolerationSeconds,MutatingAdmissionWebhook,ValidatingAdmissionWebhook,ResourceQuota" if [ ! -e $HOME/.kube ]; then sudo mv /root/.kube $HOME/.kube > /dev/null 2>&1 ||: # this will write over any previous configuration sudo chown -R $USER $HOME/.kube > /dev/null 2>&1 ||: sudo chgrp -R $USER $HOME/.kube > /dev/null 2>&1 ||: fi if [ ! -e $HOME/.minikube ]; then sudo mv /root/.minikube $HOME/.minikube # > /dev/null 2>&1 ||: this will write over any previous configuration sudo chown -R $USER $HOME/.minikube > /dev/null 2>&1 ||: sudo chgrp -R $USER $HOME/.minikube > /dev/null 2>&1 ||: fi kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/${version}/nvidia-device-plugin.yml # dns fix for dgx-stations using ubuntu's network manager kubectl apply -f https://raw.githubusercontent.com/ryanolson/k8s-upstream-dns/master/dns.yml ================================================ FILE: examples/90_Kubernetes/prometheus/bootstrap.sh ================================================ #!/bin/bash kubectl create -f service-account.yml helm init --wait --service-account tiller helm repo add coreos https://s3-eu-west-1.amazonaws.com/coreos-charts/stable/ helm install coreos/prometheus-operator \ --name prometheus-operator \ --namespace monitoring helm install coreos/kube-prometheus \ --name kube-prometheus \ --namespace monitoring \ -f custom-settings.yml kubectl apply -f yais-metrics.yml ================================================ FILE: examples/90_Kubernetes/prometheus/custom-settings.yml ================================================ global: rbacEnable: true #prometheus: # service: # type: NodePort grafana: # image: # tag: 5.2.1 service: type: NodePort dataSource: yais-datasource.json: |+ { "access": "proxy", "basicAuth": false, "name": "yais", "type": "prometheus", "url": "http://yais-metrics.default:9090" } serverDashboardFiles: yais-dashboard.json: |+ { "dashboard": { "__inputs": [ { "name": "DS_YAIS", "label": "yais", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__requires": [ { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "5.0.0" }, { "type": "panel", "id": "graph", "name": "Graph", "version": "5.0.0" }, { "type": "panel", "id": "heatmap", "name": "Heatmap", "version": "5.0.0" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "5.0.0" } ], "annotations": { "list": [ { "builtIn": 1, "datasource": "-- Grafana --", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": 
"Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 0, "id": null, "links": [], "panels": [ { "aliasColors": {}, "bars": true, "dashLength": 10, "dashes": false, "datasource": "${DS_YAIS}", "fill": 1, "gridPos": { "h": 9, "w": 12, "x": 0, "y": 0 }, "id": 8, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": false, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "irate(yais_inference_load_ratio_count[5s])", "format": "time_series", "intervalFactor": 1, "refId": "A" }, { "expr": "", "format": "time_series", "intervalFactor": 1, "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Inference Rate (FPS)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_YAIS}", "fill": 1, "gridPos": { "h": 9, "w": 12, "x": 12, "y": 0 }, "id": 4, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": false, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": true, "targets": [ { "expr": "yais_gpus_power_usage", "format": "time_series", "intervalFactor": 1, "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "GPU Power (Watts)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_YAIS}", "fill": 1, "gridPos": { "h": 9, "w": 12, "x": 0, "y": 9 }, "id": 2, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": false, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "yais_executor_queue_depth", "format": "time_series", "intervalFactor": 1, "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Queue Depth", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": 
null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "cards": { "cardPadding": null, "cardRound": null }, "color": { "cardColor": "#b4ff00", "colorScale": "sqrt", "colorScheme": "interpolateOranges", "exponent": 0.5, "mode": "spectrum" }, "dataFormat": "timeseries", "datasource": "${DS_YAIS}", "gridPos": { "h": 9, "w": 12, "x": 12, "y": 9 }, "heatmap": {}, "highlightCards": true, "id": 10, "interval": "", "legend": { "show": false }, "links": [], "targets": [ { "expr": "irate(yais_inference_load_ratio_bucket[15s])", "format": "time_series", "intervalFactor": 1, "legendFormat": "", "refId": "A" } ], "title": "Load Ratio (Ideal 0-1)", "tooltip": { "show": true, "showHistogram": false }, "type": "heatmap", "xAxis": { "show": true }, "xBucketNumber": null, "xBucketSize": null, "yAxis": { "decimals": null, "format": "short", "logBase": 10, "max": null, "min": "0", "show": true, "splitFactor": 2 }, "yBucketNumber": null, "yBucketSize": null } ], "refresh": "5s", "schemaVersion": 16, "style": "dark", "tags": [], "templating": { "list": [] }, "time": { "from": "now-30m", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "", "title": "YAIS", "uid": "3WLjQkdmk", "version": 1 }, "inputs": [ { "name": "DS_YAIS", "pluginId": "prometheus", "type": "datasource", "value": "yais" } ], "overwrite": true } ================================================ FILE: examples/90_Kubernetes/prometheus/service-account.yml ================================================ # Create a service account for Helm and grant the cluster admin role. # It is assumed that helm should be installed with this service account # (tiller). 
apiVersion: v1 kind: ServiceAccount metadata: name: tiller namespace: kube-system --- apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRoleBinding metadata: name: tiller roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: cluster-admin subjects: - kind: ServiceAccount name: tiller namespace: kube-system ================================================ FILE: examples/90_Kubernetes/prometheus/yais-dashboard.json ================================================ { "__inputs": [ { "name": "DS_YAIS", "label": "yais", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__requires": [ { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "5.2.1" }, { "type": "panel", "id": "graph", "name": "Graph", "version": "5.0.0" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "5.0.0" } ], "annotations": { "list": [ { "builtIn": 1, "datasource": "-- Grafana --", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 0, "id": null, "links": [], "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_YAIS}", "fill": 1, "gridPos": { "h": 9, "w": 12, "x": 0, "y": 0 }, "id": 8, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(yais_inference_load_ratio_count[5s])", "format": "time_series", "intervalFactor": 1, "refId": "A" }, { "expr": "irate(yais_inference_load_ratio_count[5s])", "format": "time_series", "intervalFactor": 1, "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Inference Rate (FPS)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_YAIS}", "fill": 1, "gridPos": { "h": 9, "w": 12, "x": 12, "y": 0 }, "id": 4, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "yais_gpus_power_usage", "format": "time_series", "intervalFactor": 1, "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "GPU Power (Watts)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], 
"yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_YAIS}", "fill": 1, "gridPos": { "h": 9, "w": 12, "x": 0, "y": 9 }, "id": 2, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "yais_executor_queue_depth", "format": "time_series", "intervalFactor": 1, "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Queue Depth", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "schemaVersion": 16, "style": "dark", "tags": [], "templating": { "list": [] }, "time": { "from": "now-15m", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "", "title": "YAIS", "uid": "3WLjQkdmk", "version": 1 } ================================================ FILE: examples/90_Kubernetes/prometheus/yais-metrics.yml ================================================ # # Create a Service Account, Role, Role Binding # YAIS Specific Prometheus (via Operator) and Service # --- apiVersion: v1 kind: ServiceAccount metadata: name: prometheus --- apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRole metadata: name: prometheus rules: - apiGroups: [""] resources: - nodes - services - endpoints - pods verbs: ["get", "list", "watch"] - apiGroups: [""] resources: - configmaps verbs: ["get"] - nonResourceURLs: ["/metrics"] verbs: ["get"] --- apiVersion: rbac.authorization.k8s.io/v1beta1 kind: ClusterRoleBinding metadata: name: prometheus roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: prometheus subjects: - kind: ServiceAccount name: prometheus namespace: default --- apiVersion: monitoring.coreos.com/v1 kind: Prometheus metadata: name: yais-metrics spec: serviceAccountName: prometheus serviceMonitorSelector: matchLabels: scrape: yais resources: requests: memory: 400Mi --- apiVersion: v1 kind: Service metadata: name: yais-metrics spec: ports: - name: web port: 9090 selector: prometheus: yais-metrics ================================================ FILE: examples/90_Kubernetes/yais-deploy.yml ================================================ --- apiVersion: apps/v1 kind: Deployment metadata: name: yais-example spec: replicas: 1 selector: matchLabels: app: yais-example template: metadata: labels: app: yais-example annotations: sidecar.istio.io/inject: "true" spec: containers: - name: yais-example image: yais command: ["/work/examples/90_Kubernetes/deploy/build-and-run.sh"] imagePullPolicy: IfNotPresent # Always env: - name: YAIS_CONCURRENCY value: "8" resources: limits: nvidia.com/gpu: 1 ports: - name: grpc containerPort: 50051 - name: metrics containerPort: 50078 livenessProbe: tcpSocket: port: 50051 initialDelaySeconds: 5 periodSeconds: 5 readinessProbe: 
tcpSocket:
  port: 50051
initialDelaySeconds: 5
periodSeconds: 5
---
apiVersion: v1
kind: Service
metadata:
  name: yais-example
  labels:
    app: yais-example
spec:
  selector:
    app: yais-example
  ports:
  - name: grpc
    port: 50051
    targetPort: grpc
  - name: metrics
    port: 50078
    targetPort: metrics
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: yais-example
  labels:
    scrape: yais
spec:
  selector:
    matchLabels:
      app: yais-example
  endpoints:
  - port: metrics
    interval: 2s
    honorLabels: true
---
apiVersion: networking.istio.io/v1alpha3
kind: Gateway
metadata:
  name: yais-gateway
spec:
  selector:
    istio: ingressgateway # use istio default controller
  servers:
  - hosts:
    - "*"
    port:
      name: grpc
      number: 80
      protocol: grpc
---
apiVersion: networking.istio.io/v1alpha3
kind: VirtualService
metadata:
  name: yais-virtual-service
spec:
  hosts:
  - "*"
  gateways:
  - yais-gateway
  http:
  - match:
    - uri:
        prefix: /
    route:
    - destination:
        host: yais-example.default.svc.cluster.local
        port:
          number: 50051
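# A quick smoke test of the deployment above, assuming kubectl access to the
# cluster and the client binaries built under /work/build (paths as used
# throughout these examples):
#
#   kubectl port-forward svc/yais-example 50051:50051 &
#   /work/build/examples/02_TensorRT_GRPC/client-sync.x --port=50051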

================================================
FILE: examples/91_Prometheus/README.md
================================================
# Prometheus

WIP


================================================
FILE: examples/91_Prometheus/scrape.conf
================================================
[[inputs.prometheus]]
  urls = ["http://localhost:50078/metrics"]

[[outputs.file]]
  files = ["stdout"]


================================================
FILE: examples/97_SingleProcessMultiSteam/launch_service.sh
================================================
#!/bin/bash -e
#
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

cleanup() {
    kill $(jobs -p) ||:
}
trap "cleanup" EXIT SIGINT SIGTERM

ENG=${1:-/work/models/ResNet-50-b1-fp32.engine}
NCTX=${2:-1}

if [ ! -e $ENG ]; then
    echo "$ENG not found"
    exit 911
fi

port=50051
/work/build/examples/02_TensorRT_GRPC/inference-grpc.x --port=$port --engine=${ENG} --contexts=$NCTX &
wait-for-it.sh localhost:$port --timeout=0 -- echo "YAIS Service is ready." > /dev/null 2>&1

echo "warmup with client-async.x"
/work/build/examples/02_TensorRT_GRPC/client-async.x --count=1000 --port=$port

echo
echo "Starting a shell keeping the services and load-balancer running..."
echo "Try /work/build/examples/02_TensorRT_GRPC/siege.x --rate=2000 --port=$port"
bash --rcfile <(echo "PS1='$NCTX x $ENG Subshell: '")


================================================
FILE: examples/98_MultiProcessSingleStream/README.md
================================================
# MPS Examples

`run_throughput_test ncopies batch_size engine_file MPS/NOMPS`

V100 - 16GB - DGX-1V

Processes | MPS | FPS | Batch | Model
--------- | --- | --- | ----- | -----
1         | N   | 383 | 1     | RN50
8         | N   | 365 | 1     | RN50
8         | Y   | 929 | 1     | RN50

```
root@dgx11:/work/src/Examples/98_MultiProcessSingleStream# ./run_throughput_test 8 1 /work/models/ResNet-50-b1-fp32.engine MPS
starting 8 inference services
starting load balancer
load balancing over ports: ['50051', '50052', '50053', '50054', '50055', '50056', '50057', '50058']
running test client
1000 requests in 1.07632seconds; inf/sec: 929.095

root@dgx11:/work/src/Examples/98_MultiProcessSingleStream# ./run_throughput_test 8 1 /work/models/ResNet-50-b1-fp32.engine NOMPS
starting 8 inference services
starting load balancer
load balancing over ports: ['50051', '50052', '50053', '50054', '50055', '50056', '50057', '50058']
running test client
1000 requests in 2.74228seconds; inf/sec: 364.66

root@dgx11:/work/src/Examples/98_MultiProcessSingleStream# ./run_throughput_test 1 1 /work/models/ResNet-50-b1-fp32.engine NOMPS
starting 1 inference services
starting load balancer
load balancing over ports: ['50051']
running test client
1000 requests in 2.60915seconds; inf/sec: 383.267
```
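
The companion `run_latency_test` script below takes a batch size and an engine path, defaulting to `/work/models/ResNet-50-b2-int8.engine`. A typical pair of runs, assuming the engines have already been built under `/work/models` (the paths here are the scripts' defaults, not guaranteed to exist on your system):

```
# throughput: 8 service copies behind the envoy load balancer, with MPS enabled
./run_throughput_test 8 1 /work/models/ResNet-50-b1-fp32.engine MPS

# single-stream latency against one service
./run_latency_test 1 /work/models/ResNet-50-b2-int8.engine
```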
-e $ENG ]; then echo "$ENG not found" exit 911 fi echo "starting inference service" port=50100 /work/build/examples/02_TensorRT_GRPC/inference-grpc.x --port $port --engine=${ENG} & # > /dev/null 2>&1 & wait-for-it.sh localhost:$port --timeout=0 -- echo "Server is ready." > /dev/null 2>&1 echo "running latency client" /work/build/examples/02_TensorRT_GRPC/client-sync.x --port=$port ================================================ FILE: examples/98_MultiProcessSingleStream/run_throughput_test ================================================ #!/bin/bash -e # # Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # cleanup() { kill $(jobs -p) ||: echo quit | nvidia-cuda-mps-control > /dev/null 2>&1 ||: } trap "cleanup" EXIT SIGINT SIGTERM NCTX=${1:-1} BS=${2:-1} ENG=${3:-/work/models/ResNet-50-b1-fp32.engine} MPS=${4:-"MPS"} if [ ! -e $ENG ]; then echo "$ENG not found" exit 911 fi if [ "$MPS" = "MPS" ]; then nvidia-cuda-mps-control -d ||: fi sleep 1 echo "starting $NCTX inference services" for i in $(seq 1 $NCTX); do port=$(echo "50050 + $i" | bc) /work/build/examples/02_TensorRT_GRPC/inference-grpc.x --port $port --engine=${ENG} > /dev/null 2>&1 & wait-for-it.sh localhost:$port --timeout=0 -- echo "Server ${i} is ready." > /dev/null 2>&1 done echo "starting load balancer" ../99_LoadBalancer/run_loadbalancer.py -n $NCTX # envoy -c /tmp/lb-envoy.yaml --disable-hot-restart > /dev/null 2>&1 & envoy -c /tmp/lb-envoy.yaml > /dev/null 2>&1 & wait-for-it.sh localhost:50050 --timeout=0 -- echo "Load balancer is ready." > /dev/null 2>&1 echo "running test client" /work/build/examples/02_TensorRT_GRPC/client-async.x --count=1000 --port=50050 echo echo "Starting a shell keeping the services and load-balancer running..."
echo "Try /work/build/examples/02_TensorRT_GRPC/siege.x --rate=2000 --port=50050" bash --rcfile <(echo "PS1='Throughput Subshell: '") ================================================ FILE: examples/98_MultiProcessSingleStream/setup.py ================================================ #!/usr/bin/env python3 # # Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # import os import subprocess models = [ ("ResNet-50-deploy.prototxt", "prob"), # ("ResNet-152-deploy.prototxt", "prob"), ] precisions = [ ("fp32", ""), ("fp16", "--fp16"), ("int8", "--int8") ] def main(): for model, o in models: for name, p in precisions: for b in [1]: #, 2, 4, 8]: n = "b{}-{}".format(b, name) e = model.replace("prototxt", "engine") e = e.replace("deploy", n) m = os.path.join("/work/models", model) if os.path.isfile(e): continue subprocess.call("giexec --deploy={} --batch={} --output={} {} --engine={}".format( m, b, o, p, e ), shell=True) if __name__ == "__main__": main() ================================================ FILE: examples/99_LoadBalancer/README.md ================================================ # Envoy Load Balancer Very basic Envoy Proxy L7 load balancer for testing purposes. `run_loadbalancer.py -n ` will start a copy of envoy listening on port `50050` and load-balancing over ports `[50051, 50051+n-1]`. You are responsible for spinning up the backend services. ## Notes The load-balancer overhead appears to be about 150us. Running the `client-sync.x` directly to a backend vs. through the load-balancer shows about 150us overhead per transaction. ``` # direct Throughput Subshell: /work/build/examples/02_TensorRT_GRPC/client-sync.x --port 50051 1000 requests in 2.69029 seconds; inf/sec: 371.707 # proxied via envoy load-balancer Throughput Subshell: /work/build/examples/02_TensorRT_GRPC/client-sync.x --port 50050 1000 requests in 2.8411 seconds; inf/sec: 351.977 ``` ================================================ FILE: examples/99_LoadBalancer/lb-envoy.j2 ================================================ {#- # Copyright (c) 2018-2019, NVIDIA CORPORATION. 
All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # -#} static_resources: listeners: - address: socket_address: address: 0.0.0.0 port_value: 50050 filter_chains: - filters: - name: envoy.http_connection_manager config: codec_type: auto stat_prefix: ingress_http route_config: name: local_route virtual_hosts: - name: backend domains: - "*" routes: - match: prefix: "/" headers: - name: content-type value: application/grpc route: cluster: inference http_filters: - name: envoy.router config: {} clusters: - name: inference connect_timeout: 0.25s type: strict_dns lb_policy: round_robin http2_protocol_options: {} hosts: {% for port in ports %} - socket_address: address: 127.0.0.1 port_value: {{ port }} {% endfor %} admin: access_log_path: "/dev/null" address: socket_address: address: 0.0.0.0 port_value: 8001 ================================================ FILE: examples/99_LoadBalancer/run_loadbalancer.py ================================================ #!/usr/bin/env python3 # # Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # import os import inspect import shutil import tempfile import click from jinja2 import Environment, FileSystemLoader, Template def render(template_path, data=None, extensions=None, strict=False): data = data or {} extensions = extensions or [] env = Environment( loader=FileSystemLoader(os.path.dirname(template_path)), extensions=extensions, keep_trailing_newline=True, ) if strict: from jinja2 import StrictUndefined env.undefined = StrictUndefined # Add environ global env.globals['environ'] = os.environ.get return env.get_template(os.path.basename(template_path)).render(data) script_path = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) FileType = click.Path(exists=True, file_okay=True, dir_okay=False, resolve_path=True) @click.command() @click.option("-n", default=1) @click.option("--template", type=FileType, default=os.path.join(script_path, "lb-envoy.j2")) def main(n, template): envoy = shutil.which("envoy") if envoy is None or not os.path.isfile(envoy): raise RuntimeError("envoy executable not found on PATH: {}".format(envoy)) ports = [50051 + p for p in range(n)] print("load balancing over ports: ", [str(p) for p in ports]) with open("/tmp/lb-envoy.yaml", "w") as file: file.write(render(template, data={"ports": ports})) # os.system("{} -c /tmp/lb-envoy.yaml".format(envoy)) if __name__ == "__main__": main() ================================================ FILE: examples/CMakeLists.txt ================================================ # Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
find_package(gflags 2.2.1 REQUIRED) add_subdirectory(nvRPC) add_subdirectory(Deployment) add_subdirectory(00_TensorRT) add_subdirectory(01_Basic_GRPC) add_subdirectory(02_TensorRT_GRPC) #add_subdirectory(03_Batching) add_subdirectory(04_Middleman) add_subdirectory(10_Internals) add_subdirectory(11_Protos) add_subdirectory(12_FlatBuffers) #add_subdirectory(30_PyTensorRT) ================================================ FILE: examples/Deployment/CMakeLists.txt ================================================ add_subdirectory(ImageClient) add_subdirectory(RouteRequests) ================================================ FILE: examples/Deployment/ImageClient/CMakeLists.txt ================================================ set(protobuf_MODULE_COMPATIBLE TRUE) find_package(Protobuf CONFIG REQUIRED) message(STATUS "Using protobuf ${protobuf_VERSION}") set(_PROTOBUF_LIBPROTOBUF protobuf::libprotobuf) set(_PROTOBUF_PROTOC $<TARGET_FILE:protobuf::protoc>) INCLUDE(GRPCGenerateCPP) PROTOBUF_GENERATE_CPP(PROTO_SRCS PROTO_HDRS api.proto ) PROTOBUF_GENERATE_GRPC_CPP(PROTO_GRPC_SRCS PROTO_GRPC_HDRS api.proto ) add_library(deploy-image-client-protos ${PROTO_SRCS} ${PROTO_GRPC_SRCS} ) target_link_libraries(deploy-image-client-protos PUBLIC ${_PROTOBUF_LIBPROTOBUF} ) target_include_directories(deploy-image-client-protos PUBLIC ${CMAKE_CURRENT_BINARY_DIR} ) pybind11_add_module(deploy_image_client client.cc ) target_link_libraries(deploy_image_client PUBLIC nvrpc-client deploy-image-client-protos ) ================================================ FILE: examples/Deployment/ImageClient/api.proto ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ syntax = "proto3"; package trtlab.deploy.image_client; service Inference { rpc Classify (ImageInfo) returns (Classifications) {} rpc Detection (ImageInfo) returns (Detections) {} rpc ClassifyStream (stream ImageInfo) returns (stream Classifications) {} rpc DetectionSream (stream ImageInfo) returns (stream Detections) {} } message ImageInfo { string api_key = 1; string image_uuid = 2; string model_name = 3; } message Classifications { string image_uuid = 1; repeated Score scores =2; Details details = 3; } message Detections { string image_uuid = 1; repeated Object objects = 2; Details details = 3; } message Score { int32 class_idx = 1; float score = 2; } message Object { string class_name = 1; float upper_left_x = 2; // [0, 1] normalized on width float upper_left_y = 3; // [0, 1] normalized on height float width = 4; // [0, 1] normalized on width float height = 5; // [0, 1] normalized on height } // this is where you can customize what details your service // returns to the user message Details { float request_time = 1; float compute_time = 2; } ================================================ FILE: examples/Deployment/ImageClient/client.cc ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include <map> #include <memory> #include <pybind11/pybind11.h> /* bracketed includes reconstructed; the originals were lost in extraction */ #include "client.h" using namespace trtlab; using namespace nvrpc; namespace py = pybind11; using deploy::image_client::Classifications; using deploy::image_client::Detections; using deploy::image_client::ImageInfo; using deploy::image_client::Inference; ImageClient::ImageClient(std::string hostname) { auto executor = std::make_shared<client::Executor>(1); auto channel = grpc::CreateChannel(hostname, grpc::InsecureChannelCredentials()); std::shared_ptr<Inference::Stub> stub = Inference::NewStub(channel); auto classify_prepare_fn = [stub](::grpc::ClientContext * context, const ImageInfo& request, ::grpc::CompletionQueue* cq) -> auto { return std::move(stub->PrepareAsyncClassify(context, request, cq)); }; auto detection_prepare_fn = [stub](::grpc::ClientContext * context, const ImageInfo& request, ::grpc::CompletionQueue* cq) -> auto { return std::move(stub->PrepareAsyncDetection(context, request, cq)); }; m_ClassifyClient = std::make_unique<client::ClientUnary<ImageInfo, Classifications>>( classify_prepare_fn, executor); m_DetectionClient = std::make_unique<client::ClientUnary<ImageInfo, Detections>>( detection_prepare_fn, executor); } std::shared_future<ClassifyResult> ImageClient::Classify(const std::string& model_name, const std::string& image_uuid) { ImageInfo image_info; image_info.set_model_name(model_name); image_info.set_image_uuid(image_uuid); std::map<std::string, std::string> headers = {{"custom-metadata-model-name", model_name}}; auto post = [](ImageInfo& input, Classifications& output, ::grpc::Status& status) -> ClassifyResult { ClassifyResult results(output); return std::move(results); }; return m_ClassifyClient->Enqueue(std::move(image_info), post, headers); } std::shared_future<DetectionResult> ImageClient::Detection(const std::string& model_name, const std::string& image_uuid) { ImageInfo image_info; image_info.set_model_name(model_name); image_info.set_image_uuid(image_uuid); std::map<std::string, std::string> headers = {{"custom-metadata-model-name", model_name}}; auto post = [](ImageInfo& input, Detections& output, ::grpc::Status& status) -> DetectionResult { DetectionResult results(output); return std::move(results); }; return m_DetectionClient->Enqueue(std::move(image_info), post, headers); } ClassifyResult::ClassifyResult(const ::trtlab::deploy::image_client::Classifications& pb) : m_UUID(pb.image_uuid()) { } DetectionResult::DetectionResult(const ::trtlab::deploy::image_client::Detections& pb) : m_UUID(pb.image_uuid()) { } using PyClassifyFuture = std::shared_future<ClassifyResult>; using PyDetectionFuture = std::shared_future<DetectionResult>; PYBIND11_MAKE_OPAQUE(PyClassifyFuture); PYBIND11_MAKE_OPAQUE(PyDetectionFuture); PYBIND11_MODULE(deploy_image_client, m) { py::class_<ImageClient, std::shared_ptr<ImageClient>>(m, "ImageClient") .def(py::init<std::string>(), py::arg("hostname") = "trt.lab") .def("classify", &ImageClient::Classify) .def("detection", &ImageClient::Detection); py::class_<PyClassifyFuture, std::shared_ptr<PyClassifyFuture>>(m, "ClassifyFuture") .def("wait", &PyClassifyFuture::wait, py::call_guard<py::gil_scoped_release>()) .def("get", &PyClassifyFuture::get, py::call_guard<py::gil_scoped_release>()); py::class_<PyDetectionFuture, std::shared_ptr<PyDetectionFuture>>(m, "DetectionFuture") .def("wait", &PyDetectionFuture::wait, py::call_guard<py::gil_scoped_release>()) .def("get", &PyDetectionFuture::get, py::call_guard<py::gil_scoped_release>()); py::class_<ClassifyResult, std::shared_ptr<ClassifyResult>>(m, "ClassifyResult") .def_property_readonly("uuid", &ClassifyResult::UUID); py::class_<DetectionResult, std::shared_ptr<DetectionResult>>(m, "DetectionResult") .def_property_readonly("uuid", &DetectionResult::UUID); } ================================================ FILE: examples/Deployment/ImageClient/client.h ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #pragma once #include <future> #include <memory> #include <string> /* bracketed includes reconstructed; the originals were lost in extraction */ #include "nvrpc/client/client_unary.h" #include "api.grpc.pb.h" #include "api.pb.h" class ClassifyResult { public: ClassifyResult(const ::trtlab::deploy::image_client::Classifications&); const std::string& UUID() const { return m_UUID; } private: std::string m_UUID; }; struct DetectionResult { public: DetectionResult(const ::trtlab::deploy::image_client::Detections&); const std::string& UUID() const { return m_UUID; } private: std::string m_UUID; }; class ImageClient { public: ImageClient(std::string); ~ImageClient() {} std::shared_future<ClassifyResult> Classify(const std::string&, const std::string&); std::shared_future<DetectionResult> Detection(const std::string&, const std::string&); private: using ImageInfo = ::trtlab::deploy::image_client::ImageInfo; using Classifications = ::trtlab::deploy::image_client::Classifications; using Detections = ::trtlab::deploy::image_client::Detections; std::unique_ptr<::nvrpc::client::ClientUnary<ImageInfo, Classifications>> m_ClassifyClient; std::unique_ptr<::nvrpc::client::ClientUnary<ImageInfo, Detections>> m_DetectionClient; }; ================================================ FILE: examples/Deployment/ImageClient/client.py ================================================ ## Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. ## ## Redistribution and use in source and binary forms, with or without ## modification, are permitted provided that the following conditions ## are met: ## * Redistributions of source code must retain the above copyright ## notice, this list of conditions and the following disclaimer. ## * Redistributions in binary form must reproduce the above copyright ## notice, this list of conditions and the following disclaimer in the ## documentation and/or other materials provided with the distribution. ## * Neither the name of NVIDIA CORPORATION nor the names of its ## contributors may be used to endorse or promote products derived ## from this software without specific prior written permission.
## ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY ## EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ## IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR ## PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR ## CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ## EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ## PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ## PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY ## OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ## (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ## OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE import os import uuid import boto3 import deploy_image_client as client class ImageClient: def __init__(self, *, hostname = "trt.lab"): self._cpp_client = client.ImageClient(hostname) self._s3_client = self._get_s3_client() def classify(self, image_path, model): key = self._upload_to_s3(image_path) return self._cpp_client.classify(model, key) def object_detection(self, image_path, model): key = self._upload_to_s3(image_path) return self._cpp_client.detection(model, key) def _get_s3_client(self): kwargs = {} if os.environ.get("AWS_ENDPOINT_URL"): kwargs = { "endpoint_url": os.environ.get("AWS_ENDPOINT_URL"), "use_ssl": False, "verify": False, } return boto3.client("s3", **kwargs) def _check_if_file(self, file_path): if not os.path.isfile(file_path): raise RuntimeError("{} is not a file".format(file_path)) def _upload_to_s3(self, image_path): self._check_if_file(image_path) key = str(uuid.uuid4()) with open(image_path, "rb") as data: self._s3_client.upload_fileobj(data, 'images', key) return key ================================================ FILE: examples/Deployment/Kubernetes/basic-trtis-deployment/deploy.yml ================================================ --- apiVersion: apps/v1 kind: Deployment metadata: name: basic-trtis-deployment namespace: trtlab spec: replicas: 1 selector: matchLabels: app: basic-trtis-deployment template: metadata: labels: app: basic-trtis-deployment annotations: sidecar.istio.io/inject: "true" spec: containers: - name: trtis image: nvcr.io/nvidia/tensorrtserver:19.02-py3 command: ["trtserver", "--model-store=/tmp/models"] imagePullPolicy: IfNotPresent resources: limits: nvidia.com/gpu: 1 ports: - name: http containerPort: 8000 - name: grpc containerPort: 8001 - name: metrics containerPort: 8002 livenessProbe: httpGet: path: /api/health/live port: http initialDelaySeconds: 10 periodSeconds: 5 readinessProbe: httpGet: path: /api/health/ready port: http initialDelaySeconds: 10 periodSeconds: 5 volumeMounts: - mountPath: /tmp/models name: model-store volumes: - name: model-store hostPath: path: /shared/trtis/example-model-store type: Directory --- apiVersion: v1 kind: Service metadata: name: basic-trtis-deployment namespace: trtlab labels: app: basic-trtis-deployment spec: selector: app: basic-trtis-deployment ports: - name: http port: 8000 targetPort: http - name: grpc port: 8001 targetPort: grpc - name: metrics port: 8002 targetPort: metrics ================================================ FILE: examples/Deployment/Kubernetes/basic-trtis-deployment/istio-ingress.yml ================================================ --- apiVersion: networking.istio.io/v1alpha3 kind: Gateway metadata: name: basic-trtis-deployment-gateway namespace: trtlab spec: selector: istio: ingressgateway servers: - port: name: http
number: 80 protocol: grpc hosts: - "trt.lab" --- apiVersion: networking.istio.io/v1alpha3 kind: VirtualService metadata: name: basic-trtis-deployment-virtual-service namespace: trtlab spec: hosts: - "trt.lab" gateways: - basic-trtis-deployment-gateway http: - match: - uri: prefix: /api/health/ route: - destination: host: basic-trtis-deployment port: number: 8000 - match: - uri: prefix: / route: - destination: host: basic-trtis-deployment port: number: 8001 --- apiVersion: networking.istio.io/v1alpha3 kind: DestinationRule metadata: name: basic-trtis-deployment-load-balancer namespace: trtlab spec: host: basic-trtis-deployment trafficPolicy: loadBalancer: simple: LEAST_CONN ================================================ FILE: examples/Deployment/Kubernetes/basic-trtis-deployment/scrape-metrics.yml ================================================ --- apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: name: basic-trtis-deployment namespace: trtlab labels: scrape: nv-inference-metrics spec: selector: matchLabels: app: basic-trtis-deployment endpoints: - port: metrics interval: 2s honorLabels: true ================================================ FILE: examples/Deployment/ObjectStore/README.md ================================================ # Object Store In the Image Service example, the ImageClient separates an inference request into two components: - a bulk data transfer to a backend store, - a gRPC request that contains the details of the request (model, file_handle, etc.) To implement this concept, we will use an S3-compatible Object Store. The example should work equally well on AWS S3 or via Rook's S3 CephObjectStore implementation running locally on a Kubernetes cluster. For more details on how Kubernetes and Rook were installed, see the [NVIDIA DeepOps Project](https://github.com/nvidia/deepops). This folder contains some basic configuration files and scripts for preparing the ObjectStore for our Image Service. ## AWS S3 Simply set your AWS API configuration or export the following environment variables. ``` export AWS_ACCESS_KEY_ID= export AWS_SECRET_ACCESS_KEY= ``` ## Rook + Kubernetes You will need to modify some of the configuration files for your cluster. - `rook-s3.yml` options: - requires 3 unique hosts with bluestore backed OSDs - creates a `trtlab` user If you modify the name of the ObjectStore (`trtlab-s3`) and/or the username (`trtlab`), be aware that `get_rook_s3_keys.sh` needs to be modified. Similarly, the example uses `s3.trt.lab` as the endpoint on which the storage is hosted. If you change this, you will need to modify `get_rook_s3_keys.sh` to output the proper `AWS_ENDPOINT_URL`. You will also need to modify the ingress examples with the proper hostname. ``` kubectl apply -f rook-s3.yml kubectl apply -f ingress-nginx.yml ``` ### Setup your environment ``` eval $(./get_rook_s3_keys.sh) ``` ### Prepare your Image bucket Note: you will need to have python3 and boto3 installed. This does not have to be done inside the container.
``` python3 create_buckets.py ``` ## TODOs - [ ] Export S3 keys to a Kubernetes Secret - [ ] Scripts for bucket maintenance: probably some k8s CronJobs - [ ] Update Istio ingress example ================================================ FILE: examples/Deployment/ObjectStore/create_buckets.py ================================================ import os import boto3 s3 = boto3.client("s3", use_ssl=False, verify=False, endpoint_url=os.environ.get("AWS_ENDPOINT_URL")) response = s3.list_buckets() buckets = [b["Name"] for b in response["Buckets"]] if "images" not in buckets: s3.create_bucket(Bucket="images") response = s3.list_buckets() buckets = [b["Name"] for b in response["Buckets"]] print(buckets) ================================================ FILE: examples/Deployment/ObjectStore/get_rook_s3_keys.sh ================================================ #!/bin/bash objstore=trtlab-s3 user=trtlab echo -n export AWS_ACCESS_KEY_ID= kubectl -n rook-ceph get secret rook-ceph-object-user-${objstore}-${user} -o yaml | grep AccessKey | awk '{print $2}' | base64 --decode echo echo -n export AWS_SECRET_ACCESS_KEY= kubectl -n rook-ceph get secret rook-ceph-object-user-${objstore}-${user} -o yaml | grep SecretKey | awk '{print $2}' | base64 --decode echo echo export AWS_ENDPOINT_URL=http://s3.trt.lab echo ================================================ FILE: examples/Deployment/ObjectStore/ingress-istio.yml ================================================ # not working yet --- apiVersion: networking.istio.io/v1alpha3 kind: Gateway metadata: name: trtlab-s3-gateway spec: selector: app: trtlab-s3 istio: ingressgateway servers: - hosts: - "s3.trt.lab" port: number: 80 name: http protocol: HTTP --- apiVersion: networking.istio.io/v1alpha3 kind: VirtualService metadata: name: trtlab-s3-virtualservice spec: hosts: - "s3.trt.lab" gateways: - trtlab-s3-gateway http: - match: - uri: prefix: / route: - destination: host: rook-ceph-rgw-trtlab-s3.rook-ceph port: number: 80 ================================================ FILE: examples/Deployment/ObjectStore/ingress-nginx.yml ================================================ apiVersion: extensions/v1beta1 kind: Ingress metadata: annotations: nginx.ingress.kubernetes.io/proxy-body-size: "0" nginx.ingress.kubernetes.io/proxy-read-timeout: "600" nginx.ingress.kubernetes.io/proxy-send-timeout: "600" name: trtlab-s3-ingress namespace: rook-ceph spec: rules: - host: s3.trt.lab http: paths: - backend: serviceName: rook-ceph-rgw-trtlab-s3 servicePort: 80 path: / ================================================ FILE: examples/Deployment/ObjectStore/rook-s3.yml ================================================ --- apiVersion: ceph.rook.io/v1 kind: CephObjectStore metadata: name: trtlab-s3 namespace: rook-ceph spec: metadataPool: failureDomain: host replicated: size: 3 dataPool: failureDomain: host erasureCoded: dataChunks: 2 codingChunks: 1 gateway: type: s3 sslCertificateRef: port: 80 securePort: instances: 1 allNodes: false --- apiVersion: ceph.rook.io/v1 kind: CephObjectStoreUser metadata: name: trtlab namespace: rook-ceph spec: store: trtlab-s3 displayName: "TensorRT Laboratory" ================================================ FILE: examples/Deployment/README.md ================================================ # Deploying Inference Services This document/example folder is a work in progress. Its intent is to cover various aspects of deployment, including strategies, limitations, services, and Kubernetes examples of deploying inference services.
Over the course of this guide, we will build a full end-to-end image processing service deployed on Kubernetes. Let us start by assuming all your models can be served/deployed with the [TensorRT Inference Server, aka TRTIS](https://github.com/nvidia/tensorrt-inference-server). One of the primary advantages of TRTIS is the ability to host multiple models in a single linux process. Given the capabilities of modern GPUs, this is the most efficient way for multiple models to share both compute and memory resources. Next, let's dive a little deeper into the features of TRTIS that give it this efficiency advantage. - _Concurrent Executions_ allow for multiple independent inference batches to be in-flight on the device at a given time. This could be multiple batches of the same model, single batches of different models, or any combination imaginable. - Running on a Tesla V100 GPU with ResNet-152 Batch8 - Allowing only 1 in-flight batch8 yields XXX images/sec with a compute latency of YYY. - Allowing 8 concurrent batch8 executions increases the throughput to 2500 images/sec; however, the compute latency per batch increases to ZZZ. - To evaluate the performance of TensorRT models as a function of concurrent in-flight executions we provide the [TensorRT/ConcurrentExecution](../TensorRT/ConcurrentExecution) example. ```bash infer.x --engine=/external/models/ResNet-152-b8-fp16.engine --concurrency=8 ``` - The value of concurrent executions will differ depending on the compute requirements of the model and the GPU on which it executes. The best practice is to benchmark and evaluate performance. - _Tunable Concurrency_ enables you to specify, on a per-model basis, the number of concurrent copies that can be executed at any given time. Follow the guidelines in Concurrent Executions to tune this option on a per-model basis. - _Dynamic Batching_ allows individual requests from either the same or different clients to be multiplexed into a single mini-batch and inferred. Dynamic Batching is performed on a per-model basis and can have a max `preferred_batch_size` and a `max_queue_delay_microseconds`, which specifies how long messages are allowed to accumulate. Batching is one of the best ways to improve throughput. Depending on your needs, you will want to balance the added latency required for batching vs the throughput improvements achieved; see the sketch after this section. - Note: in scale-out deployments where unary (send/recv, not streaming) requests are being load-balanced across multiple TRTIS instances, the value of dynamic batching in TRTIS decreases as the number of replicas increases. - TODO: We address this issue by creating Dynamic Batching Services that sit in front of the Load-Balancer. Add a discussion and update the [examples/03_Batching] example with the latest streaming Server/Client - _Custom Metrics_ provide application-level metrics on how the TRTIS service is performing. Analyzing these metrics can provide insight on when a service is being overloaded and when to add more resources. TRTIS and some TRTLAB examples expose Prometheus Metrics. - [examples/90_Kubernetes/prometheus] is one example of how to use Kubernetes + Prometheus + Grafana to scrape and visualize metrics from running TRTIS services. - TODO: Document and clean up examples. The goal of this project is to provide supplementary support to TRTIS by providing building blocks that help you build companion microservices that work with TRTIS, as well as example deployment scenarios.
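To make the batching tradeoff concrete, here is a small back-of-the-envelope sketch; every number in it is an illustrative assumption, not a TRTIS benchmark:

```python
# Rough model of the dynamic-batching tradeoff: batching amortizes per-launch
# cost at the price of queueing delay. All inputs are assumed values.
def batching_tradeoff(batch_size, compute_ms_per_batch, max_queue_delay_ms):
    # worst case: a request arrives just as the window opens and waits the
    # full queue delay before its batch even begins computing
    worst_case_latency_ms = max_queue_delay_ms + compute_ms_per_batch
    throughput_per_sec = batch_size * 1000.0 / compute_ms_per_batch
    return worst_case_latency_ms, throughput_per_sec

for batch, compute_ms in [(1, 2.6), (8, 8.0)]:  # assumed per-batch times
    latency, throughput = batching_tradeoff(batch, compute_ms, 5.0)
    print("batch={}: ~{:.0f} inf/sec, worst-case latency ~{:.1f} ms".format(
        batch, throughput, latency))
```

With these assumed numbers, batch 8 roughly 2.6x's the throughput while adding a few milliseconds of worst-case latency; the right balance depends entirely on your model and latency budget.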
## Configure a TRTIS Kubernetes Deployment In Kubernetes, a [Deployment](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/) allows you to define a [Pod](https://kubernetes.io/docs/concepts/workloads/pods/pod/) and the number of copies of that pod, i.e. a [ReplicaSet](https://kubernetes.io/docs/concepts/workloads/controllers/replicaset/), that you would like deployed on your cluster. ### Build a Model Store The [TRTIS Model Store] is the collection of models that will be loaded and served by a TRTIS instance. For details on how to build a model store, see the [Deployment/ModelStore] example. In order to define our Kubernetes Deployment, we need to map our model store into our TRTIS Pod. There are multiple ways this can be achieved. - Extend the TRTIS container image and add the model store to the image - Mount an External Volume into the TRTIS Pod - Use another container in the same TRTIS Pod that will dynamically generate a model store from some external data sources, e.g. S3. - TODO: Add this example My Kubernetes cluster has an NFS mount at `/shared` on every node. For this example, our model store will be located at `/shared/trtis/example-model-store` ### Deploy TRTIS Pods ## Configure Multiple TRTIS Deployments with Different Sets of Models Now suppose you are serving more models than you can allocate on a single GPU. In this scenario, we can split our list of models into groups and spread those groups out over multiple TRTIS `Deployments` ## Scenario #1: 2 Models, 10 Servers, 20 GPUs Assume you have two computer vision models, e.g. classification, object detection, segmentation, that you wish to deploy on 10 servers. Probably the first question you might ask yourself is: what is the expected breakdown of load for each model? Is it 50/50? Or is it 90/10? Does it vary by time of day? Is it predictable? The strategy with TRTIS is simple: you deploy 20 replicas of the TRTIS service across 10 servers, 1 TRTIS service per GPU, and tell your load-balancer to round-robin requests across your services. However, unless you customize TRTIS, by default, TRTIS only accepts raw tensors as inputs and returns raw tensors as outputs. What are your inputs? And what's a reasonable expectation for the rest of the inference pipeline? Let's assume you are receiving JPEG images. First, you need to decode the images to raw pixels, then you need to prepare the images to be inferred. To keep things simple, let's assume that both models use the same input preprocessing method. What is the compression ratio of your JPEG images? Assuming the images are 8-bit, then [a blog post by Graphics Mill](https://www.graphicsmill.com/blog/2014/11/06/Compression-ratio-for-different-JPEG-quality-values#.XHtdPpNKiXE_) measured the compression ratio to be 1:5.27 for JPEG Quality 100 (Q=100) and 1:43.27 for Q=55. For this discussion we will focus on Q=80, which was measured to be a 1:25 ratio. This is the decompression ratio for JPEG bytes to INT8 pixel values. However, most DNN models, after normalization, accept fp32 tensors as inputs. This means we have roughly a 1:100 ratio of input JPEG bytes to bytes of input tensors. A 100KiB image becomes a 9.75MiB data structure that needs to be provided to TRTIS. This 100x increase in size is big, but not unreasonably large. If your images are coming in over the Internet/WAN, then your LAN connection is likely to be 100x faster. However, if this were video, that compression ratio per frame would be MUCH larger.
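The arithmetic behind that 100x figure, as a quick worked sketch (the 100KiB input is just an example size; the 1:25 ratio is the Q=80 figure cited above):

```python
# Worked form of the payload-size arithmetic above.
jpeg_kib = 100.0        # example input: a 100 KiB JPEG
decode_ratio = 25       # JPEG bytes -> INT8 pixels at Q=80 (cited figure)
fp32_expansion = 4      # each INT8 pixel becomes a 4-byte fp32 value
tensor_mib = jpeg_kib * decode_ratio * fp32_expansion / 1024
print("{:.0f} KiB JPEG -> {:.2f} MiB fp32 tensor".format(jpeg_kib, tensor_mib))
# prints: 100 KiB JPEG -> 9.77 MiB fp32 tensor, i.e. a ~100x expansion
```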
To future-proof our implementation, we are going to do our best to minimize the amount of data we move around. Next, what is it that our users are providing to us besides just the image to be inferred? The client must inform the server which model should be used for the request. And similarly, the server probably also wants to know some details about the user. In this example, we will use a user API key to authenticate the user. Now let's look at the data payload the client will be sending our server. We are going to break down an image inference request into two parts: - the bulk image data - the request metadata: api key, image_uuid, model_name This separation allows us to communicate and move each component more optimally. We will send the JPEG image bytes directly to an object store and we will use a gRPC message to communicate the metadata for the request. Why not embed the image directly into the gRPC message? While this is certainly a possibility, we are choosing this separation because in our optimal pipeline, our client request message will go through several services before the bulk image data is needed. These services include an ingress router/load-balancer and an external batching service before the requests are sent to the image pre-processing service. This separation helps us future-proof our implementation by avoiding unnecessary data movement. Let's break down our client implementation: - 1) writes the jpeg image data to an S3-compatible object store as some UUID.jpg - 2) creates an async gRPC unary (send/recv) RPC request to our inference service. - the message payload consists of our client api key, image uuid and model_name - add custom headers that will enable our ingress router/load-balancer to properly route the message to the correct target without the need to deserialize the request payload. This allows us to route messages directly to services keyed on that metadata. In this case, we will add the model name to the headers so we can eventually route requests to batching services unique to that model. This means our client can continue to issue async inference requests with the promise that the results will be returned at some future time. On the server side, this design breaks the inference request down into two components: the data and the request. Let's assume our incoming data transport is very efficient. The images will be deposited into an S3-compatible object store. On my Kubernetes cluster, I'll be using [Rook's Minio/S3 Operator](https://rook.io), but it would work equally well on AWS. ================================================ FILE: examples/Deployment/RouteRequests/CMakeLists.txt ================================================ add_executable(test_image_service.x test_service.cc ) target_link_libraries(test_image_service.x PUBLIC nvrpc deploy-image-client-protos gflags ) ================================================ FILE: examples/Deployment/RouteRequests/README.md ================================================ # Routing Requests If we have multiple instances of TRTIS each with different models, we need a way to route requests to the proper service. There are two convenient options: routing by subdomain or routing by headers.
In this example, we will have three unique pools of TRTIS services: - Pool A: only handles `model_a` `Classify` requests - Pool B: only handles `model_b` requests (`Classify` or `Detection`) - General Pool: handles all other requests In our hypothetical deployment scenario, both `model_a` and `model_b` are particularly active so we have dedicated resources to handle those requests. Similarly, our general pool has its own fixed size. Later, we will show how to auto-scale pods based on TRTIS and GPU metrics. A simple approach would be to host `model_a.trt.lab`, `model_b.trt.lab` and `general_pool.trt.lab` and have the client make the decision sender-side on where to send the requests. However, changes to the service layout would require updates to the client software, which makes this option less appealing. Ideally, we want our entire service to be hosted on a single endpoint `trt.lab`. To ensure our requests arrive at the proper destination server-side, we have our client add [Custom Metadata](https://github.com/grpc/grpc/blob/master/doc/PROTOCOL-HTTP2.md) to each gRPC request. ```c++ std::map<std::string, std::string> headers = {{"custom-metadata-model-name", model_name}}; ``` which inside our client library ```c++ for (auto& header : headers) { // add headers to ::grpc::ClientContext ctx->m_Context.AddMetadata(header.first, header.second); } ``` To test routing, we have provided an `envoy_config.yaml` configuration. The load-balancer/router listens on port 50050 and routes to three sample services running on 51051, 51052 and 51053. To differentiate by endpoint, `Classify` or `Detection`, we can match routes on their `uri`. Here are the relevant parts of the envoy config: ```yaml - match: prefix: /trtlab.deploy.image_client.Inference/Classify headers: - name: custom-metadata-model-name exact_match: model_a grpc: route: cluster: classify_model_a - match: prefix: / headers: - name: custom-metadata-model-name exact_match: model_b grpc: route: cluster: model_b - match: prefix: / grpc: route: cluster: general_pool ``` `test_routing.sh` provides a convenient means to test the configuration. It will compile a simple implementation of the ImageClient service, bring up an instance of Envoy, 3 instances of the test_service, then send both `Classify` and `Detection` requests with three different models to the router. The service implementation simply returns which named service handled the request. ```s root@5e8ffb38df87:/work/examples/Deployment/RouteRequests# ./test_routing.sh ... some start up output ... Testing Classify RPC I0307 13:41:36.614954 355 test_service.cc:74] model_a served by model_a I0307 13:41:36.616070 359 test_service.cc:74] model_b served by model_b I0307 13:41:36.617031 362 test_service.cc:74] model_c served by general_pool Testing Detection RPC I0307 13:41:36.617636 362 test_service.cc:74] model_a served by general_pool I0307 13:41:36.618005 359 test_service.cc:74] model_b served by model_b I0307 13:41:36.618367 362 test_service.cc:74] model_c served by general_pool **** Test Passed **** ``` While we are using Envoy (v1.9) directly in this example, we will later show how this can be accomplished in Istio. A major TODO in this project is to build a TRTIS operator which will provide a Kubernetes CRD able to dynamically manage the routes as a function of where the model will be loaded on the cluster. Whereas this example shows static placement, we eventually want to get to fully dynamic routes.
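For reference, the same routing header can be attached from a plain Python gRPC client as well; the sketch below assumes stubs generated from `api.proto` as `api_pb2`/`api_pb2_grpc` (those module names are an assumption, and `grpcio` plus the generated code must be on your `PYTHONPATH`):

```python
import grpc
import api_pb2        # assumed name for the python codegen of api.proto
import api_pb2_grpc   # assumed name for the grpc service stubs

# connect to the Envoy router, not directly to a backend
channel = grpc.insecure_channel("localhost:50050")
stub = api_pb2_grpc.InferenceStub(channel)

request = api_pb2.ImageInfo(model_name="model_b", image_uuid="some-uuid")
# Envoy matches on this metadata key to select the model_b cluster
metadata = (("custom-metadata-model-name", "model_b"),)
response = stub.Classify(request, metadata=metadata)
print(response.image_uuid)  # test_service.cc echoes back the serving pool
```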
================================================ FILE: examples/Deployment/RouteRequests/envoy_config.yaml ================================================ static_resources: listeners: - name: listener_0 address: socket_address: { address: 0.0.0.0, port_value: 50050 } filter_chains: - filters: - name: envoy.http_connection_manager config: codec_type: auto stat_prefix: ingress_http route_config: name: local_route virtual_hosts: - name: backend domains: - "*" routes: - match: prefix: /trtlab.deploy.image_client.Inference/Classify headers: # - name: content-type # value: application/grpc - name: custom-metadata-model-name exact_match: model_a grpc: route: cluster: classify_model_a - match: prefix: / headers: # - name: content-type # value: application/grpc - name: custom-metadata-model-name exact_match: model_b grpc: route: cluster: model_b - match: prefix: / # headers: # - name: content-type # value: application/grpc grpc: route: cluster: general_pool http_filters: - name: envoy.router config: {} clusters: - name: classify_model_a connect_timeout: 0.25s type: strict_dns lb_policy: round_robin http2_protocol_options: {} hosts: - socket_address: address: 127.0.0.1 port_value: 51051 - name: model_b connect_timeout: 0.25s type: strict_dns lb_policy: round_robin http2_protocol_options: {} hosts: - socket_address: address: 127.0.0.1 port_value: 51052 - name: general_pool connect_timeout: 0.25s type: strict_dns lb_policy: round_robin http2_protocol_options: {} hosts: - socket_address: address: 127.0.0.1 port_value: 51053 admin: access_log_path: "/dev/null" address: socket_address: address: 0.0.0.0 port_value: 8001 ================================================ FILE: examples/Deployment/RouteRequests/test_client.py ================================================ import os import deploy_image_client as cpp_client def main(): if not os.environ.get("TRTLAB_ROUTING_TEST"): raise RuntimeError( "Please run this script in the environment set up by test_routing.sh") router = cpp_client.ImageClient("localhost:50050") print("Testing Classify RPC") a = router.classify("model_a", "via_router_uuid1").get() b = router.classify("model_b", "via_router_uuid2").get() c = router.classify("model_c", "via_router_uuid3").get() assert a.uuid == "model_a" assert b.uuid == "model_b" assert c.uuid == "general_pool" print("Testing Detection RPC") a = router.detection("model_a", "via_router_uuid1").get() b = router.detection("model_b", "via_router_uuid2").get() c = router.detection("model_c", "via_router_uuid3").get() assert a.uuid == "general_pool" assert b.uuid == "model_b" assert c.uuid == "general_pool" print("\n**** Test Passed ****\n") if __name__ == "__main__": try: main() except RuntimeError as e: print("\n**** Error ****") print(e) print() ================================================ FILE: examples/Deployment/RouteRequests/test_routing.sh ================================================ #!/bin/bash cleanup() { kill $(jobs -p) ||: } trap "cleanup" EXIT SIGINT SIGTERM (cd /work/build/examples/Deployment/ImageClient; make) (cd /work/build/examples/Deployment/RouteRequests; make) export PYTHONPATH=$PYTHONPATH:/work/build/examples/Deployment/ImageClient exe=/work/build/examples/Deployment/RouteRequests/test_image_service.x $exe --hostname="model_a" --ip_port="0.0.0.0:51051" & #> /dev/null 2>&1 & $exe --hostname="model_b" --ip_port="0.0.0.0:51052" & #> /dev/null 2>&1 & $exe --hostname="general_pool" --ip_port="0.0.0.0:51053" & #> /dev/null 2>&1 & envoy -c envoy_config.yaml > /dev/null 2>&1 & wait-for-it.sh
localhost:50050 --timeout=0 -- echo "Envoy on 50050 ready" wait-for-it.sh localhost:51051 --timeout=0 -- echo "ModelA on 51051 ready" wait-for-it.sh localhost:51052 --timeout=0 -- echo "ModelB on 51052 ready" wait-for-it.sh localhost:51053 --timeout=0 -- echo "General Pool on 51053 ready" export TRTLAB_ROUTING_TEST=True python3 test_client.py ================================================ FILE: examples/Deployment/RouteRequests/test_service.cc ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include <gflags/gflags.h> #include <glog/logging.h> /* bracketed includes reconstructed; the originals were lost in extraction */ #include "tensorrt/laboratory/core/pool.h" #include "tensorrt/laboratory/core/resources.h" #include "tensorrt/laboratory/core/thread_pool.h" using trtlab::Resources; using trtlab::ThreadPool; #include "nvrpc/executor.h" #include "nvrpc/server.h" #include "nvrpc/service.h" using nvrpc::AsyncRPC; using nvrpc::AsyncService; using nvrpc::Context; using nvrpc::Executor; using nvrpc::Server; #include "api.grpc.pb.h" #include "api.pb.h" using trtlab::deploy::image_client::Classifications; using trtlab::deploy::image_client::Detections; using trtlab::deploy::image_client::ImageInfo; using trtlab::deploy::image_client::Inference; // CLI Options DEFINE_string(hostname, "localhost", "Hostname"); DEFINE_string(ip_port, "0.0.0.0:50051", "IP/Port on which to listen"); class TestResources : public Resources { public: TestResources(const std::string& hostname) : m_Hostname(hostname) {} const std::string& Hostname() const { return m_Hostname; } private: std::string m_Hostname; }; template<typename Output> class TestContext final : public Context<ImageInfo, Output, TestResources> { void ExecuteRPC(ImageInfo& input, Output& output) final override { LOG(INFO) << input.model_name() << " served by " << this->GetResources()->Hostname(); output.set_image_uuid(this->GetResources()->Hostname()); this->FinishResponse(); } }; int main(int argc, char* argv[]) { FLAGS_alsologtostderr = 1; // Log to console ::google::InitGoogleLogging("test_deploy_client"); ::google::ParseCommandLineFlags(&argc, &argv, true); Server server(FLAGS_ip_port); auto service = server.RegisterAsyncService<Inference>(); auto rpc_classify = service->RegisterRPC<TestContext<Classifications>>( &Inference::AsyncService::RequestClassify); auto rpc_detection = service->RegisterRPC<TestContext<Detections>>(&Inference::AsyncService::RequestDetection); auto resources = std::make_shared<TestResources>(FLAGS_hostname); auto executor = server.RegisterExecutor(new Executor(1)); executor->RegisterContexts(rpc_classify, resources, 1); executor->RegisterContexts(rpc_detection, resources, 1); server.Run(std::chrono::milliseconds(2000), [] {}); } ================================================ FILE: examples/Deployment/batcher.cc ================================================ /* unfinished sketch; template arguments lost in extraction have been reconstructed and inconsistent identifiers unified */ template<typename Request, typename Response> class BatchingService { public: using PrepareFn = std::function<std::unique_ptr<::grpc::ClientAsyncReaderWriter<Request, Response>>( ::grpc::ClientContext*, ::grpc::CompletionQueue*)>; using Callback = std::function<void(bool)>; struct MessageType { Request* request; Response* response; Callback callback; }; class Resources { public: Resources(PrepareFn prepare_fn, std::shared_ptr<client::Executor> executor, std::shared_ptr<::trtlab::ThreadPool> post_process, uint32_t max_batch_size, uint64_t timeout_in_us) : m_PrepareFn(prepare_fn), m_Executor(executor), m_WaitAndFinish(post_process), m_MaxBatchSize(max_batch_size), m_Timeout(timeout_in_us) { } std::shared_ptr<client::ClientStreaming<Request, Response>> CreateClient(std::function<void(Response&&)> on_recv) { auto on_sent = [](Request&& request) {}; return std::make_shared<client::ClientStreaming<Request, Response>>( m_PrepareFn, m_Executor, on_sent, on_recv); } void Enqueue(Request& req, Response& resp, Callback callback) { m_MessageQueue.enqueue(MessageType{&req, &resp, callback}); } protected: void BatchingEngine() { constexpr uint64_t quanta = 100; const double timeout = static_cast<double>(m_Timeout - quanta) / 1000000.0; const size_t max_batch_size = m_MaxBatchSize; size_t total_count; size_t max_deque; std::shared_ptr<std::vector<MessageType>> messages; std::chrono::time_point<std::chrono::high_resolution_clock> start; thread_local moodycamel::ConsumerToken token(m_MessageQueue); // clang-format off auto elapsed_time = [](std::chrono::time_point<std::chrono::high_resolution_clock>& start) -> double { return std::chrono::duration<double>( std::chrono::high_resolution_clock::now() - start).count(); }; // clang-format on for(;;) {
messages[m_MaxBatchsize]; max_batch = m_MaxBatchsize; total_count = 0; // pull 1 element from the queue and start timer // if dequeue times outs, then restart the loop total_count = m_MessageQueue.wait_dequeue_bulk_timed( token, &(*messages)[total_count], 1, quanta); max_deque = max_batch_size - total_count; if(count == 0) { continue; } // Create a Corked Stream - Corked = buffered writes stream = CreateClient([messages](Response&& response) mutable { CHECK(!messges.empty()); DLOG(INFO) << "Finishing Unary Response/Callback: " << message.size() << " remain on queue"; auto m = messages.front(); m.response = std::move(response); m.callback(); messages.erase(messages.begin()); }); stream->Corked(true); // Continue to collect inference requests until we reach a maximum batch size // or we hit the timeout. We will eagerly forward our current batch items along // the stream so the preprocessor can get ahead start start = std::chrono::high_resolution_clock::now(); while(total_count < max_batch_size && elapsed(start) <) { total_count += total_count = m_MessageQueue.wait_dequeue_bulk_timed( token, &(*messages)[total_count], max_deque, quanta); max_deque = max_batch_size - total_count; for(; isend < total_count; isend++) { auto& m = (*messages)[isend]; stream->Write(m.request)); } } // Batching complete if(total_count) { messages->resize(total_count); stream->Done(); m_WaitAndFinish.enqueue([stream]() mutable { auto future = stream->Status(); future.wait(); streawm.reset(); }); messages.reset(new std::vector); messages->resize(max_batch_size); } } } private: PrepareFn m_PrepareFn; std::shared_ptr m_Executor; std::shared_ptr<::trtlab::ThreadPool> m_WaitAndDelete; size_t m_MaxBatchsize; uint64_t m_Timeout; BlockingConcurrentQueue m_MessageQueue; }; class BatchingContext : public Context { void ExecuteRPC(Request& request, Response& response) final override { LOG(INFO) << "incoming unary request"; this->GetResources()->Enqueue(&request, &response, [this](bool ok) { if(ok) { this->FinishResponse(); } else { LOG(ERROR) << "Upstream Error"; this->CancelResponse(); } }); } }; }; ================================================ FILE: examples/ONNX/resnet50/README.md ================================================ # TensorRT ResNet50 Example - `fetch.sh` downloads the onnx model, test data, and calibration images from S3 - after running this script the `resnet50` and `calibration_images` directories should be present in your local path - Build (`build.py`) TensorRT engines from the `model.onnx` file - cli options: - `--batch` will select the batch size, multiple can be given, a separate engine for each batch size will be generated. - `--precision` can be `fp32`, `fp16`, or `int8`. if multiple precision are given, an engine for each will be created. - **Note**: To use `int8` precision, you will need a Turing, Volta, or Pascal GPU with compute capability 6.1. 
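The fragment above leaves the wiring implicit. The sketch below shows one way the pieces could fit together, reusing the nvRPC registration pattern from `test_service.cc`; the stub method `PrepareAsyncBatchedClassify`, the `client_executor` handle, and the pool/context counts are hypothetical, not part of the source.

```cpp
// Hypothetical wiring sketch for BatchingService (not part of the source tree).
using Batcher = BatchingService<ImageInfo, Classifications>;

// How the batcher opens a bidirectional stream to the upstream model service;
// 'stub' and 'PrepareAsyncBatchedClassify' are assumed names.
Batcher::PrepareFn prepare = [stub](::grpc::ClientContext* ctx,
                                    ::grpc::CompletionQueue* cq) {
    return stub->PrepareAsyncBatchedClassify(ctx, cq);
};

auto resources = std::make_shared<Batcher::Resources>(
    prepare,
    client_executor,                           // nvRPC client executor (assumed)
    std::make_shared<::trtlab::ThreadPool>(1), // wait-and-finish pool
    /*max_batch_size=*/8,
    /*timeout_in_us=*/2000);

// Expose the unary RPC whose requests are batched onto the corked stream,
// mirroring the registration calls in test_service.cc above.
auto rpc = service->RegisterRPC<Batcher::BatchingContext>(
    &Inference::AsyncService::RequestClassify);
executor->RegisterContexts(rpc, resources, 10);
```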
================================================
FILE: examples/ONNX/resnet50/README.md
================================================
# TensorRT ResNet50 Example

- `fetch.sh` downloads the ONNX model, test data, and calibration images from S3
  - after running this script the `resnet50` and `calibration_images` directories should be present in your local path
- Build (`build.py`) TensorRT engines from the `model.onnx` file
  - CLI options:
    - `--batch` selects the batch size; multiple values can be given, and a separate engine is generated for each batch size.
    - `--precision` can be `fp32`, `fp16`, or `int8`; if multiple precisions are given, an engine is created for each.
  - **Note**: To use `int8` precision, you will need a Turing or Volta GPU, or a Pascal GPU with compute capability 6.1.
- If you have a Turing or Volta GPU, run the following command, which generates 4 engines:
  ```
  ./build.py --batch=1 --batch=8 --precision=fp16 --precision=int8 resnet50/model.onnx
  ```
- If you have a Pascal GPU, run the following, which generates 2 engines:
  ```
  ./build.py --batch=1 --batch=8 --precision=fp32 resnet50/model.onnx
  ```
- Functional Test
  - `./run_onnx_tests.py model-b1-fp16.engine` will run the ONNX tests
- Benchmark TensorRT engines at different batch sizes and concurrent executions:
  - `/work/build/examples/00_TensorRT/infer.x --engine=model-b1-fp16.engine --contexts=1`
  - `/work/build/examples/00_TensorRT/infer.x --engine=model-b1-fp16.engine --contexts=8`
  - `/work/build/examples/00_TensorRT/infer.x --engine=model-b8-fp16.engine --contexts=1`
  - `/work/build/examples/00_TensorRT/infer.x --engine=model-b8-fp16.engine --contexts=6`
- `./run_jpeg_test.py --image=images/broccoli-3784.jpg model-b1-fp16.engine`
  - Note: this example requires MXNet for image preprocessing: `pip install mxnet`
  - On a V100 using FP16, your results should be close to
    ```
    *** Results ***
    broccoli 0.9511453
    ```
- `./run_jpeg_test.py --image=images/broccoli-3784.jpg model-b1-int8.engine`
  - When using INT8, your results should be close to
    ```
    *** Results ***
    broccoli 0.9228073
    ```

## Credits

- [broccoli image](https://www.openfotos.com/view/broccoli-3784)
  - OpenFotos - https://www.openfotos.com/pages/open-fotos-license
- [calibration images](calibration_images.csv)
  - Images from OpenFotos and Pixabay
  - https://www.openfotos.com/pages/open-fotos-license
  - https://pixabay.com/service/license/

## TODOs

- [ ] Update `run_jpeg_test.py` to highlight the async interface.

================================================
FILE: examples/ONNX/resnet50/build.py
================================================
#!/usr/bin/env python3 # # Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
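# Engine naming convention (derived from the format strings below): building
# resnet50/model.onnx with --batch=1 --precision=fp16 produces
# "model-b1-fp16.engine", which is the name the README's test commands expect.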
# import os import subprocess import click import int8 precision_opts = { "fp32": "", "fp16": "--fp16", "int8": "--fp16 --int8", } File = click.Path(exists=True, file_okay=True, dir_okay=False, resolve_path=True) @click.command() @click.option("--batch", type=click.IntRange(min=1, max=32), multiple=True) @click.option("--precision", type=click.Choice(["fp32", "fp16", "int8"]), multiple=True) @click.argument("models", type=File, nargs=-1) def main(models, batch, precision): for model in models: #click.echo(model) #click.echo(precision) for p in precision: #click.echo(p) for b in batch: #click.echo(b) n = "b{}-{}".format(b, p) m = os.path.basename(model) m, ext = os.path.splitext(m) e = "{}-{}.{}".format(m,n,"engine") if os.path.isfile(e): print("A TensorRT engine {} already exists! Skipping...".format(e)) continue elif p == "int8": assert os.path.isdir("./calibration_images"), "Need to download calibration images before creating INT8 engine!" int8.build_int8_engine_onnx(model, "./calibration_images", b, 32, e) else: subprocess.call("trtexec --onnx={} --batch={} {} --saveEngine={}".format(model, b, precision_opts.get(p), e), shell=True) if __name__ == "__main__": main() ================================================ FILE: examples/ONNX/resnet50/calibration_images.csv ================================================ url,license,label https://www.openfotos.com/pictures/red-rock-crab-1096.full.jpg,Open Fotos License,crab https://www.openfotos.com/pictures/lazy-cats-4109.full.jpg,Open Fotos License,cat https://www.openfotos.com/pictures/beautiful-elephants-4005.full.jpg,Open Fotos License,elephants https://www.openfotos.com/pictures/funny-wild-pigs-437.full.jpg,Open Fotos License,pig https://www.openfotos.com/pictures/hopping-5017.full.jpg,Open Fotos License,bird https://www.openfotos.com/pictures/old-car-4811.full.jpg,Open Fotos License,car https://www.openfotos.com/pictures/audi-4740.full.jpg,Open Fotos License,car https://www.openfotos.com/pictures/boarding-in-the-plane-4891.full.jpg,Open Fotos License,plane https://cdn.pixabay.com/photo/2016/03/09/09/28/bear-1245807_960_720.jpg,Pixabay License,bear https://cdn.pixabay.com/photo/2015/02/26/06/09/panda-649938_960_720.jpg,Pixabay License,giant panda https://cdn.pixabay.com/photo/2015/09/22/19/00/ship-952292_960_720.jpg,Pixabay License,boat https://cdn.pixabay.com/photo/2017/01/16/19/17/horses-1984977_960_720.jpg,Pixabay License,horse https://cdn.pixabay.com/photo/2015/03/26/09/54/pug-690566_960_720.jpg,Pixabay License,dog https://cdn.pixabay.com/photo/2016/08/19/15/23/lizard-1605515_960_720.jpg,Pixabay License,lizzard https://cdn.pixabay.com/photo/2016/07/09/12/16/apple-1506119_960_720.jpg,Pixabay License,apple https://cdn.pixabay.com/photo/2016/10/07/14/11/tangerines-1721633_960_720.jpg,Pixabay License,orange https://cdn.pixabay.com/photo/2015/06/19/16/48/watermelon-815072_960_720.jpg,Pixabay License,watermelon https://cdn.pixabay.com/photo/2017/06/09/16/39/carrots-2387394_960_720.jpg,Pixabay License,carrot https://cdn.pixabay.com/photo/2019/02/28/22/45/hippo-4027011_960_720.jpg,Pixabay License,hippo https://cdn.pixabay.com/photo/2012/03/04/00/09/africa-21787_960_720.jpg,Pixabay License,lion https://cdn.pixabay.com/photo/2015/07/27/19/47/turtle-863336_960_720.jpg,Pixabay License,turtle https://cdn.pixabay.com/photo/2018/04/15/17/45/fish-3322230_960_720.jpg,Pixabay License,fish https://cdn.pixabay.com/photo/2013/11/01/11/13/dolphin-203875_960_720.jpg,Pixabay License,dolphin 
https://cdn.pixabay.com/photo/2014/05/02/21/49/home-office-336373_960_720.jpg,Pixabay License,laptop https://cdn.pixabay.com/photo/2012/05/18/21/45/tiger-mosquito-49141_960_720.jpg,Pixabay License,mosquito https://cdn.pixabay.com/photo/2016/08/09/13/21/coffee-1580595_960_720.jpg,Pixabay License,cup https://cdn.pixabay.com/photo/2016/02/19/10/36/lemons-1209309_960_720.jpg,Pixabay License,lemon https://cdn.pixabay.com/photo/2018/12/14/02/41/chengdu-3874136_960_720.jpg,Pixabay License,hotpot https://cdn.pixabay.com/photo/2015/09/05/12/53/violin-924349_960_720.jpg,Pixabay License,violin https://cdn.pixabay.com/photo/2015/11/07/11/22/pillows-1031079_960_720.jpg,Pixabay License,pillow https://cdn.pixabay.com/photo/2016/11/18/22/26/animal-1837164_960_720.jpg,Pixabay License,rhino https://cdn.pixabay.com/photo/2016/12/04/21/58/rabbit-1882699_960_720.jpg,Pixabay License,rabbit https://cdn.pixabay.com/photo/2014/10/23/18/56/tiger-500118_960_720.jpg,Pixabay License,tiger ================================================ FILE: examples/ONNX/resnet50/calibrator.py ================================================ # # Copyright 1993-2019 NVIDIA Corporation. All rights reserved. # # NOTICE TO LICENSEE: # # This source code and/or documentation ("Licensed Deliverables") are # subject to NVIDIA intellectual property rights under U.S. and # international Copyright laws. # # These Licensed Deliverables contained herein is PROPRIETARY and # CONFIDENTIAL to NVIDIA and is being provided under the terms and # conditions of a form of NVIDIA software license agreement by and # between NVIDIA and Licensee ("License Agreement") or electronically # accepted by Licensee. Notwithstanding any terms or conditions to # the contrary in the License Agreement, reproduction or disclosure # of the Licensed Deliverables to any third party without the express # written consent of NVIDIA is prohibited. # # NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE # LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE # SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS # PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. # NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED # DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, # NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. # NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE # LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY # SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE # OF THESE LICENSED DELIVERABLES. # # U.S. Government End Users. These Licensed Deliverables are a # "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT # 1995), consisting of "commercial computer software" and "commercial # computer software documentation" as such terms are used in 48 # C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government # only as a commercial end item. Consistent with 48 C.F.R.12.212 and # 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all # U.S. Government End Users acquire the Licensed Deliverables with # only those rights set forth herein. # # Any use of the Licensed Deliverables in individual and commercial # software must include, in the user documentation and internal # comments to the code, the above Disclaimer and U.S. 
Government End # Users Notice. import tensorrt as trt import os import pycuda.driver as cuda import pycuda.autoinit import matplotlib.pyplot as plt import mxnet as mx from mxnet.gluon.data.vision import transforms import numpy as np from random import shuffle class ONNXEntropyCalibrator(trt.IInt8EntropyCalibrator): def __init__(self, image_dir, batch_size, calibration_batches, cache_file): # Whenever you specify a custom constructor for a TensorRT class, # you MUST call the constructor of the parent explicitly. trt.IInt8EntropyCalibrator.__init__(self) self.cache_file = cache_file # Get a list of all the images in the image directory. image_files = [os.path.join(image_dir, f) for f in os.listdir(image_dir)] shuffle(image_files) if len(image_files) < calibration_batches * batch_size: print("Only found enough images for {} batches instead of {}, continuing anyway...".format(len(image_files) // batch_size, calibration_batches)) self.image_files = image_files else: self.image_files = image_files[:calibration_batches * batch_size] # Keeps track of current image in image list self.current_image = 0 self.batch_size = batch_size self.input_size = [3,224,224] # Each element of the calibration data is a float32. self.device_input = cuda.mem_alloc(self.batch_size * self.input_size[0] * self.input_size[1] * self.input_size[2] * trt.float32.itemsize) # Create a generator that will give us batches. We can use next() to iterate over the result. def load_batches(): while self.current_image < len(self.image_files): data, images_read = self.read_image_batch() self.current_image += images_read yield data self.batches = load_batches() def transform_image(self, img): transform_fn = transforms.Compose([ transforms.Resize(224), transforms.CenterCrop(224), transforms.ToTensor(), transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) ]) img = transform_fn(mx.nd.array(img)).asnumpy() return img # This function is used to load calibration images into batches. def read_image_batch(self): # Depending on batch size and number of images, the final batch might only be partially full. images_to_read = min(self.batch_size, len(self.image_files) - self.current_image) host_buffer = np.zeros(shape=[self.batch_size]+self.input_size) for i in range(images_to_read): img = np.array(plt.imread(self.image_files[self.current_image])) img = self.transform_image(img) host_buffer[i,:,:,:] = img return host_buffer, images_to_read def get_batch_size(self): return self.batch_size # TensorRT passes along the names of the engine bindings to the get_batch function. # You don't necessarily have to use them, but they can be useful to understand the order of # the inputs. The bindings list is expected to have the same ordering as 'names'. def get_batch(self, names): try: # Get a single batch. data = np.ascontiguousarray(next(self.batches), np.float32) # Copy to device, then return a list containing pointers to input device buffers. cuda.memcpy_htod(self.device_input, data) return [int(self.device_input)] except StopIteration: # When we're out of batches, we return either [] or None. # This signals to TensorRT that there is no calibration data remaining. return None def read_calibration_cache(self): # If there is a cache, use it instead of calibrating again. Otherwise, implicitly return None. 
if os.path.exists(self.cache_file): with open(self.cache_file, "rb") as f: return f.read() def write_calibration_cache(self, cache): with open(self.cache_file, "wb") as f: f.write(cache) ================================================ FILE: examples/ONNX/resnet50/fetch.sh ================================================ #!/bin/bash if [ ! -e "resnet50.tar.gz" ]; then wget https://s3.amazonaws.com/download.onnx/models/opset_8/resnet50.tar.gz fi if [ ! -e "open_source_images.tar.gz" ]; then wget https://s3-us-west-2.amazonaws.com/com.nvidia.tensorrt-laboratory/open_source_images.tar.gz fi if md5sum -c resnet50.md5; then if [ ! -e "resnet50" ]; then tar xzf resnet50.tar.gz fi echo "ResNet50 download good" else echo "ResNet50 md5 checksum failed" exit 911 fi if md5sum -c open_source_images.md5; then if [ ! -e "calibration_images" ]; then tar xf open_source_images.tar.gz fi echo "All good - Continue to Build Phase" else echo "calibration_images md5 checksum failed" exit 911 fi ================================================ FILE: examples/ONNX/resnet50/imagenet_labels.py ================================================ labels = {0: 'tench, Tinca tinca', 1: 'goldfish, Carassius auratus', 2: 'great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias', 3: 'tiger shark, Galeocerdo cuvieri', 4: 'hammerhead, hammerhead shark', 5: 'electric ray, crampfish, numbfish, torpedo', 6: 'stingray', 7: 'cock', 8: 'hen', 9: 'ostrich, Struthio camelus', 10: 'brambling, Fringilla montifringilla', 11: 'goldfinch, Carduelis carduelis', 12: 'house finch, linnet, Carpodacus mexicanus', 13: 'junco, snowbird', 14: 'indigo bunting, indigo finch, indigo bird, Passerina cyanea', 15: 'robin, American robin, Turdus migratorius', 16: 'bulbul', 17: 'jay', 18: 'magpie', 19: 'chickadee', 20: 'water ouzel, dipper', 21: 'kite', 22: 'bald eagle, American eagle, Haliaeetus leucocephalus', 23: 'vulture', 24: 'great grey owl, great gray owl, Strix nebulosa', 25: 'European fire salamander, Salamandra salamandra', 26: 'common newt, Triturus vulgaris', 27: 'eft', 28: 'spotted salamander, Ambystoma maculatum', 29: 'axolotl, mud puppy, Ambystoma mexicanum', 30: 'bullfrog, Rana catesbeiana', 31: 'tree frog, tree-frog', 32: 'tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui', 33: 'loggerhead, loggerhead turtle, Caretta caretta', 34: 'leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea', 35: 'mud turtle', 36: 'terrapin', 37: 'box turtle, box tortoise', 38: 'banded gecko', 39: 'common iguana, iguana, Iguana iguana', 40: 'American chameleon, anole, Anolis carolinensis', 41: 'whiptail, whiptail lizard', 42: 'agama', 43: 'frilled lizard, Chlamydosaurus kingi', 44: 'alligator lizard', 45: 'Gila monster, Heloderma suspectum', 46: 'green lizard, Lacerta viridis', 47: 'African chameleon, Chamaeleo chamaeleon', 48: 'Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis', 49: 'African crocodile, Nile crocodile, Crocodylus niloticus', 50: 'American alligator, Alligator mississipiensis', 51: 'triceratops', 52: 'thunder snake, worm snake, Carphophis amoenus', 53: 'ringneck snake, ring-necked snake, ring snake', 54: 'hognose snake, puff adder, sand viper', 55: 'green snake, grass snake', 56: 'king snake, kingsnake', 57: 'garter snake, grass snake', 58: 'water snake', 59: 'vine snake', 60: 'night snake, Hypsiglena torquata', 61: 'boa constrictor, Constrictor constrictor', 62: 'rock python, rock snake, Python sebae', 63: 'Indian cobra, Naja naja', 64: 'green mamba', 65: 
'sea snake', 66: 'horned viper, cerastes, sand viper, horned asp, Cerastes cornutus', 67: 'diamondback, diamondback rattlesnake, Crotalus adamanteus', 68: 'sidewinder, horned rattlesnake, Crotalus cerastes', 69: 'trilobite', 70: 'harvestman, daddy longlegs, Phalangium opilio', 71: 'scorpion', 72: 'black and gold garden spider, Argiope aurantia', 73: 'barn spider, Araneus cavaticus', 74: 'garden spider, Aranea diademata', 75: 'black widow, Latrodectus mactans', 76: 'tarantula', 77: 'wolf spider, hunting spider', 78: 'tick', 79: 'centipede', 80: 'black grouse', 81: 'ptarmigan', 82: 'ruffed grouse, partridge, Bonasa umbellus', 83: 'prairie chicken, prairie grouse, prairie fowl', 84: 'peacock', 85: 'quail', 86: 'partridge', 87: 'African grey, African gray, Psittacus erithacus', 88: 'macaw', 89: 'sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita', 90: 'lorikeet', 91: 'coucal', 92: 'bee eater', 93: 'hornbill', 94: 'hummingbird', 95: 'jacamar', 96: 'toucan', 97: 'drake', 98: 'red-breasted merganser, Mergus serrator', 99: 'goose', 100: 'black swan, Cygnus atratus', 101: 'tusker', 102: 'echidna, spiny anteater, anteater', 103: 'platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus', 104: 'wallaby, brush kangaroo', 105: 'koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus', 106: 'wombat', 107: 'jellyfish', 108: 'sea anemone, anemone', 109: 'brain coral', 110: 'flatworm, platyhelminth', 111: 'nematode, nematode worm, roundworm', 112: 'conch', 113: 'snail', 114: 'slug', 115: 'sea slug, nudibranch', 116: 'chiton, coat-of-mail shell, sea cradle, polyplacophore', 117: 'chambered nautilus, pearly nautilus, nautilus', 118: 'Dungeness crab, Cancer magister', 119: 'rock crab, Cancer irroratus', 120: 'fiddler crab', 121: 'king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica', 122: 'American lobster, Northern lobster, Maine lobster, Homarus americanus', 123: 'spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish', 124: 'crayfish, crawfish, crawdad, crawdaddy', 125: 'hermit crab', 126: 'isopod', 127: 'white stork, Ciconia ciconia', 128: 'black stork, Ciconia nigra', 129: 'spoonbill', 130: 'flamingo', 131: 'little blue heron, Egretta caerulea', 132: 'American egret, great white heron, Egretta albus', 133: 'bittern', 134: 'crane', 135: 'limpkin, Aramus pictus', 136: 'European gallinule, Porphyrio porphyrio', 137: 'American coot, marsh hen, mud hen, water hen, Fulica americana', 138: 'bustard', 139: 'ruddy turnstone, Arenaria interpres', 140: 'red-backed sandpiper, dunlin, Erolia alpina', 141: 'redshank, Tringa totanus', 142: 'dowitcher', 143: 'oystercatcher, oyster catcher', 144: 'pelican', 145: 'king penguin, Aptenodytes patagonica', 146: 'albatross, mollymawk', 147: 'grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus', 148: 'killer whale, killer, orca, grampus, sea wolf, Orcinus orca', 149: 'dugong, Dugong dugon', 150: 'sea lion', 151: 'Chihuahua', 152: 'Japanese spaniel', 153: 'Maltese dog, Maltese terrier, Maltese', 154: 'Pekinese, Pekingese, Peke', 155: 'Shih-Tzu', 156: 'Blenheim spaniel', 157: 'papillon', 158: 'toy terrier', 159: 'Rhodesian ridgeback', 160: 'Afghan hound, Afghan', 161: 'basset, basset hound', 162: 'beagle', 163: 'bloodhound, sleuthhound', 164: 'bluetick', 165: 'black-and-tan coonhound', 166: 'Walker hound, Walker foxhound', 167: 'English foxhound', 168: 'redbone', 169: 'borzoi, Russian wolfhound', 170: 'Irish wolfhound', 171: 'Italian greyhound', 
172: 'whippet', 173: 'Ibizan hound, Ibizan Podenco', 174: 'Norwegian elkhound, elkhound', 175: 'otterhound, otter hound', 176: 'Saluki, gazelle hound', 177: 'Scottish deerhound, deerhound', 178: 'Weimaraner', 179: 'Staffordshire bullterrier, Staffordshire bull terrier', 180: 'American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier', 181: 'Bedlington terrier', 182: 'Border terrier', 183: 'Kerry blue terrier', 184: 'Irish terrier', 185: 'Norfolk terrier', 186: 'Norwich terrier', 187: 'Yorkshire terrier', 188: 'wire-haired fox terrier', 189: 'Lakeland terrier', 190: 'Sealyham terrier, Sealyham', 191: 'Airedale, Airedale terrier', 192: 'cairn, cairn terrier', 193: 'Australian terrier', 194: 'Dandie Dinmont, Dandie Dinmont terrier', 195: 'Boston bull, Boston terrier', 196: 'miniature schnauzer', 197: 'giant schnauzer', 198: 'standard schnauzer', 199: 'Scotch terrier, Scottish terrier, Scottie', 200: 'Tibetan terrier, chrysanthemum dog', 201: 'silky terrier, Sydney silky', 202: 'soft-coated wheaten terrier', 203: 'West Highland white terrier', 204: 'Lhasa, Lhasa apso', 205: 'flat-coated retriever', 206: 'curly-coated retriever', 207: 'golden retriever', 208: 'Labrador retriever', 209: 'Chesapeake Bay retriever', 210: 'German short-haired pointer', 211: 'vizsla, Hungarian pointer', 212: 'English setter', 213: 'Irish setter, red setter', 214: 'Gordon setter', 215: 'Brittany spaniel', 216: 'clumber, clumber spaniel', 217: 'English springer, English springer spaniel', 218: 'Welsh springer spaniel', 219: 'cocker spaniel, English cocker spaniel, cocker', 220: 'Sussex spaniel', 221: 'Irish water spaniel', 222: 'kuvasz', 223: 'schipperke', 224: 'groenendael', 225: 'malinois', 226: 'briard', 227: 'kelpie', 228: 'komondor', 229: 'Old English sheepdog, bobtail', 230: 'Shetland sheepdog, Shetland sheep dog, Shetland', 231: 'collie', 232: 'Border collie', 233: 'Bouvier des Flandres, Bouviers des Flandres', 234: 'Rottweiler', 235: 'German shepherd, German shepherd dog, German police dog, alsatian', 236: 'Doberman, Doberman pinscher', 237: 'miniature pinscher', 238: 'Greater Swiss Mountain dog', 239: 'Bernese mountain dog', 240: 'Appenzeller', 241: 'EntleBucher', 242: 'boxer', 243: 'bull mastiff', 244: 'Tibetan mastiff', 245: 'French bulldog', 246: 'Great Dane', 247: 'Saint Bernard, St Bernard', 248: 'Eskimo dog, husky', 249: 'malamute, malemute, Alaskan malamute', 250: 'Siberian husky', 251: 'dalmatian, coach dog, carriage dog', 252: 'affenpinscher, monkey pinscher, monkey dog', 253: 'basenji', 254: 'pug, pug-dog', 255: 'Leonberg', 256: 'Newfoundland, Newfoundland dog', 257: 'Great Pyrenees', 258: 'Samoyed, Samoyede', 259: 'Pomeranian', 260: 'chow, chow chow', 261: 'keeshond', 262: 'Brabancon griffon', 263: 'Pembroke, Pembroke Welsh corgi', 264: 'Cardigan, Cardigan Welsh corgi', 265: 'toy poodle', 266: 'miniature poodle', 267: 'standard poodle', 268: 'Mexican hairless', 269: 'timber wolf, grey wolf, gray wolf, Canis lupus', 270: 'white wolf, Arctic wolf, Canis lupus tundrarum', 271: 'red wolf, maned wolf, Canis rufus, Canis niger', 272: 'coyote, prairie wolf, brush wolf, Canis latrans', 273: 'dingo, warrigal, warragal, Canis dingo', 274: 'dhole, Cuon alpinus', 275: 'African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus', 276: 'hyena, hyaena', 277: 'red fox, Vulpes vulpes', 278: 'kit fox, Vulpes macrotis', 279: 'Arctic fox, white fox, Alopex lagopus', 280: 'grey fox, gray fox, Urocyon cinereoargenteus', 281: 'tabby, tabby cat', 282: 'tiger cat', 283: 
'Persian cat', 284: 'Siamese cat, Siamese', 285: 'Egyptian cat', 286: 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor', 287: 'lynx, catamount', 288: 'leopard, Panthera pardus', 289: 'snow leopard, ounce, Panthera uncia', 290: 'jaguar, panther, Panthera onca, Felis onca', 291: 'lion, king of beasts, Panthera leo', 292: 'tiger, Panthera tigris', 293: 'cheetah, chetah, Acinonyx jubatus', 294: 'brown bear, bruin, Ursus arctos', 295: 'American black bear, black bear, Ursus americanus, Euarctos americanus', 296: 'ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus', 297: 'sloth bear, Melursus ursinus, Ursus ursinus', 298: 'mongoose', 299: 'meerkat, mierkat', 300: 'tiger beetle', 301: 'ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle', 302: 'ground beetle, carabid beetle', 303: 'long-horned beetle, longicorn, longicorn beetle', 304: 'leaf beetle, chrysomelid', 305: 'dung beetle', 306: 'rhinoceros beetle', 307: 'weevil', 308: 'fly', 309: 'bee', 310: 'ant, emmet, pismire', 311: 'grasshopper, hopper', 312: 'cricket', 313: 'walking stick, walkingstick, stick insect', 314: 'cockroach, roach', 315: 'mantis, mantid', 316: 'cicada, cicala', 317: 'leafhopper', 318: 'lacewing, lacewing fly', 319: "dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk", 320: 'damselfly', 321: 'admiral', 322: 'ringlet, ringlet butterfly', 323: 'monarch, monarch butterfly, milkweed butterfly, Danaus plexippus', 324: 'cabbage butterfly', 325: 'sulphur butterfly, sulfur butterfly', 326: 'lycaenid, lycaenid butterfly', 327: 'starfish, sea star', 328: 'sea urchin', 329: 'sea cucumber, holothurian', 330: 'wood rabbit, cottontail, cottontail rabbit', 331: 'hare', 332: 'Angora, Angora rabbit', 333: 'hamster', 334: 'porcupine, hedgehog', 335: 'fox squirrel, eastern fox squirrel, Sciurus niger', 336: 'marmot', 337: 'beaver', 338: 'guinea pig, Cavia cobaya', 339: 'sorrel', 340: 'zebra', 341: 'hog, pig, grunter, squealer, Sus scrofa', 342: 'wild boar, boar, Sus scrofa', 343: 'warthog', 344: 'hippopotamus, hippo, river horse, Hippopotamus amphibius', 345: 'ox', 346: 'water buffalo, water ox, Asiatic buffalo, Bubalus bubalis', 347: 'bison', 348: 'ram, tup', 349: 'bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis', 350: 'ibex, Capra ibex', 351: 'hartebeest', 352: 'impala, Aepyceros melampus', 353: 'gazelle', 354: 'Arabian camel, dromedary, Camelus dromedarius', 355: 'llama', 356: 'weasel', 357: 'mink', 358: 'polecat, fitch, foulmart, foumart, Mustela putorius', 359: 'black-footed ferret, ferret, Mustela nigripes', 360: 'otter', 361: 'skunk, polecat, wood pussy', 362: 'badger', 363: 'armadillo', 364: 'three-toed sloth, ai, Bradypus tridactylus', 365: 'orangutan, orang, orangutang, Pongo pygmaeus', 366: 'gorilla, Gorilla gorilla', 367: 'chimpanzee, chimp, Pan troglodytes', 368: 'gibbon, Hylobates lar', 369: 'siamang, Hylobates syndactylus, Symphalangus syndactylus', 370: 'guenon, guenon monkey', 371: 'patas, hussar monkey, Erythrocebus patas', 372: 'baboon', 373: 'macaque', 374: 'langur', 375: 'colobus, colobus monkey', 376: 'proboscis monkey, Nasalis larvatus', 377: 'marmoset', 378: 'capuchin, ringtail, Cebus capucinus', 379: 'howler monkey, howler', 380: 'titi, titi monkey', 381: 'spider monkey, Ateles geoffroyi', 382: 'squirrel monkey, Saimiri sciureus', 383: 'Madagascar cat, ring-tailed lemur, Lemur catta', 384: 'indri, indris, Indri indri, Indri brevicaudatus', 385: 'Indian 
elephant, Elephas maximus', 386: 'African elephant, Loxodonta africana', 387: 'lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens', 388: 'giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca', 389: 'barracouta, snoek', 390: 'eel', 391: 'coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch', 392: 'rock beauty, Holocanthus tricolor', 393: 'anemone fish', 394: 'sturgeon', 395: 'gar, garfish, garpike, billfish, Lepisosteus osseus', 396: 'lionfish', 397: 'puffer, pufferfish, blowfish, globefish', 398: 'abacus', 399: 'abaya', 400: "academic gown, academic robe, judge's robe", 401: 'accordion, piano accordion, squeeze box', 402: 'acoustic guitar', 403: 'aircraft carrier, carrier, flattop, attack aircraft carrier', 404: 'airliner', 405: 'airship, dirigible', 406: 'altar', 407: 'ambulance', 408: 'amphibian, amphibious vehicle', 409: 'analog clock', 410: 'apiary, bee house', 411: 'apron', 412: 'ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin', 413: 'assault rifle, assault gun', 414: 'backpack, back pack, knapsack, packsack, rucksack, haversack', 415: 'bakery, bakeshop, bakehouse', 416: 'balance beam, beam', 417: 'balloon', 418: 'ballpoint, ballpoint pen, ballpen, Biro', 419: 'Band Aid', 420: 'banjo', 421: 'bannister, banister, balustrade, balusters, handrail', 422: 'barbell', 423: 'barber chair', 424: 'barbershop', 425: 'barn', 426: 'barometer', 427: 'barrel, cask', 428: 'barrow, garden cart, lawn cart, wheelbarrow', 429: 'baseball', 430: 'basketball', 431: 'bassinet', 432: 'bassoon', 433: 'bathing cap, swimming cap', 434: 'bath towel', 435: 'bathtub, bathing tub, bath, tub', 436: 'beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon', 437: 'beacon, lighthouse, beacon light, pharos', 438: 'beaker', 439: 'bearskin, busby, shako', 440: 'beer bottle', 441: 'beer glass', 442: 'bell cote, bell cot', 443: 'bib', 444: 'bicycle-built-for-two, tandem bicycle, tandem', 445: 'bikini, two-piece', 446: 'binder, ring-binder', 447: 'binoculars, field glasses, opera glasses', 448: 'birdhouse', 449: 'boathouse', 450: 'bobsled, bobsleigh, bob', 451: 'bolo tie, bolo, bola tie, bola', 452: 'bonnet, poke bonnet', 453: 'bookcase', 454: 'bookshop, bookstore, bookstall', 455: 'bottlecap', 456: 'bow', 457: 'bow tie, bow-tie, bowtie', 458: 'brass, memorial tablet, plaque', 459: 'brassiere, bra, bandeau', 460: 'breakwater, groin, groyne, mole, bulwark, seawall, jetty', 461: 'breastplate, aegis, egis', 462: 'broom', 463: 'bucket, pail', 464: 'buckle', 465: 'bulletproof vest', 466: 'bullet train, bullet', 467: 'butcher shop, meat market', 468: 'cab, hack, taxi, taxicab', 469: 'caldron, cauldron', 470: 'candle, taper, wax light', 471: 'cannon', 472: 'canoe', 473: 'can opener, tin opener', 474: 'cardigan', 475: 'car mirror', 476: 'carousel, carrousel, merry-go-round, roundabout, whirligig', 477: "carpenter's kit, tool kit", 478: 'carton', 479: 'car wheel', 480: 'cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM', 481: 'cassette', 482: 'cassette player', 483: 'castle', 484: 'catamaran', 485: 'CD player', 486: 'cello, violoncello', 487: 'cellular telephone, cellular phone, cellphone, cell, mobile phone', 488: 'chain', 489: 'chainlink fence', 490: 'chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour', 491: 'chain saw, chainsaw', 492: 'chest', 493: 'chiffonier, commode', 494: 'chime, bell, gong', 495: 
'china cabinet, china closet', 496: 'Christmas stocking', 497: 'church, church building', 498: 'cinema, movie theater, movie theatre, movie house, picture palace', 499: 'cleaver, meat cleaver, chopper', 500: 'cliff dwelling', 501: 'cloak', 502: 'clog, geta, patten, sabot', 503: 'cocktail shaker', 504: 'coffee mug', 505: 'coffeepot', 506: 'coil, spiral, volute, whorl, helix', 507: 'combination lock', 508: 'computer keyboard, keypad', 509: 'confectionery, confectionary, candy store', 510: 'container ship, containership, container vessel', 511: 'convertible', 512: 'corkscrew, bottle screw', 513: 'cornet, horn, trumpet, trump', 514: 'cowboy boot', 515: 'cowboy hat, ten-gallon hat', 516: 'cradle', 517: 'crane', 518: 'crash helmet', 519: 'crate', 520: 'crib, cot', 521: 'Crock Pot', 522: 'croquet ball', 523: 'crutch', 524: 'cuirass', 525: 'dam, dike, dyke', 526: 'desk', 527: 'desktop computer', 528: 'dial telephone, dial phone', 529: 'diaper, nappy, napkin', 530: 'digital clock', 531: 'digital watch', 532: 'dining table, board', 533: 'dishrag, dishcloth', 534: 'dishwasher, dish washer, dishwashing machine', 535: 'disk brake, disc brake', 536: 'dock, dockage, docking facility', 537: 'dogsled, dog sled, dog sleigh', 538: 'dome', 539: 'doormat, welcome mat', 540: 'drilling platform, offshore rig', 541: 'drum, membranophone, tympan', 542: 'drumstick', 543: 'dumbbell', 544: 'Dutch oven', 545: 'electric fan, blower', 546: 'electric guitar', 547: 'electric locomotive', 548: 'entertainment center', 549: 'envelope', 550: 'espresso maker', 551: 'face powder', 552: 'feather boa, boa', 553: 'file, file cabinet, filing cabinet', 554: 'fireboat', 555: 'fire engine, fire truck', 556: 'fire screen, fireguard', 557: 'flagpole, flagstaff', 558: 'flute, transverse flute', 559: 'folding chair', 560: 'football helmet', 561: 'forklift', 562: 'fountain', 563: 'fountain pen', 564: 'four-poster', 565: 'freight car', 566: 'French horn, horn', 567: 'frying pan, frypan, skillet', 568: 'fur coat', 569: 'garbage truck, dustcart', 570: 'gasmask, respirator, gas helmet', 571: 'gas pump, gasoline pump, petrol pump, island dispenser', 572: 'goblet', 573: 'go-kart', 574: 'golf ball', 575: 'golfcart, golf cart', 576: 'gondola', 577: 'gong, tam-tam', 578: 'gown', 579: 'grand piano, grand', 580: 'greenhouse, nursery, glasshouse', 581: 'grille, radiator grille', 582: 'grocery store, grocery, food market, market', 583: 'guillotine', 584: 'hair slide', 585: 'hair spray', 586: 'half track', 587: 'hammer', 588: 'hamper', 589: 'hand blower, blow dryer, blow drier, hair dryer, hair drier', 590: 'hand-held computer, hand-held microcomputer', 591: 'handkerchief, hankie, hanky, hankey', 592: 'hard disc, hard disk, fixed disk', 593: 'harmonica, mouth organ, harp, mouth harp', 594: 'harp', 595: 'harvester, reaper', 596: 'hatchet', 597: 'holster', 598: 'home theater, home theatre', 599: 'honeycomb', 600: 'hook, claw', 601: 'hoopskirt, crinoline', 602: 'horizontal bar, high bar', 603: 'horse cart, horse-cart', 604: 'hourglass', 605: 'iPod', 606: 'iron, smoothing iron', 607: "jack-o'-lantern", 608: 'jean, blue jean, denim', 609: 'jeep, landrover', 610: 'jersey, T-shirt, tee shirt', 611: 'jigsaw puzzle', 612: 'jinrikisha, ricksha, rickshaw', 613: 'joystick', 614: 'kimono', 615: 'knee pad', 616: 'knot', 617: 'lab coat, laboratory coat', 618: 'ladle', 619: 'lampshade, lamp shade', 620: 'laptop, laptop computer', 621: 'lawn mower, mower', 622: 'lens cap, lens cover', 623: 'letter opener, paper knife, paperknife', 624: 'library', 625: 'lifeboat', 626: 
'lighter, light, igniter, ignitor', 627: 'limousine, limo', 628: 'liner, ocean liner', 629: 'lipstick, lip rouge', 630: 'Loafer', 631: 'lotion', 632: 'loudspeaker, speaker, speaker unit, loudspeaker system, speaker system', 633: "loupe, jeweler's loupe", 634: 'lumbermill, sawmill', 635: 'magnetic compass', 636: 'mailbag, postbag', 637: 'mailbox, letter box', 638: 'maillot', 639: 'maillot, tank suit', 640: 'manhole cover', 641: 'maraca', 642: 'marimba, xylophone', 643: 'mask', 644: 'matchstick', 645: 'maypole', 646: 'maze, labyrinth', 647: 'measuring cup', 648: 'medicine chest, medicine cabinet', 649: 'megalith, megalithic structure', 650: 'microphone, mike', 651: 'microwave, microwave oven', 652: 'military uniform', 653: 'milk can', 654: 'minibus', 655: 'miniskirt, mini', 656: 'minivan', 657: 'missile', 658: 'mitten', 659: 'mixing bowl', 660: 'mobile home, manufactured home', 661: 'Model T', 662: 'modem', 663: 'monastery', 664: 'monitor', 665: 'moped', 666: 'mortar', 667: 'mortarboard', 668: 'mosque', 669: 'mosquito net', 670: 'motor scooter, scooter', 671: 'mountain bike, all-terrain bike, off-roader', 672: 'mountain tent', 673: 'mouse, computer mouse', 674: 'mousetrap', 675: 'moving van', 676: 'muzzle', 677: 'nail', 678: 'neck brace', 679: 'necklace', 680: 'nipple', 681: 'notebook, notebook computer', 682: 'obelisk', 683: 'oboe, hautboy, hautbois', 684: 'ocarina, sweet potato', 685: 'odometer, hodometer, mileometer, milometer', 686: 'oil filter', 687: 'organ, pipe organ', 688: 'oscilloscope, scope, cathode-ray oscilloscope, CRO', 689: 'overskirt', 690: 'oxcart', 691: 'oxygen mask', 692: 'packet', 693: 'paddle, boat paddle', 694: 'paddlewheel, paddle wheel', 695: 'padlock', 696: 'paintbrush', 697: "pajama, pyjama, pj's, jammies", 698: 'palace', 699: 'panpipe, pandean pipe, syrinx', 700: 'paper towel', 701: 'parachute, chute', 702: 'parallel bars, bars', 703: 'park bench', 704: 'parking meter', 705: 'passenger car, coach, carriage', 706: 'patio, terrace', 707: 'pay-phone, pay-station', 708: 'pedestal, plinth, footstall', 709: 'pencil box, pencil case', 710: 'pencil sharpener', 711: 'perfume, essence', 712: 'Petri dish', 713: 'photocopier', 714: 'pick, plectrum, plectron', 715: 'pickelhaube', 716: 'picket fence, paling', 717: 'pickup, pickup truck', 718: 'pier', 719: 'piggy bank, penny bank', 720: 'pill bottle', 721: 'pillow', 722: 'ping-pong ball', 723: 'pinwheel', 724: 'pirate, pirate ship', 725: 'pitcher, ewer', 726: "plane, carpenter's plane, woodworking plane", 727: 'planetarium', 728: 'plastic bag', 729: 'plate rack', 730: 'plow, plough', 731: "plunger, plumber's helper", 732: 'Polaroid camera, Polaroid Land camera', 733: 'pole', 734: 'police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria', 735: 'poncho', 736: 'pool table, billiard table, snooker table', 737: 'pop bottle, soda bottle', 738: 'pot, flowerpot', 739: "potter's wheel", 740: 'power drill', 741: 'prayer rug, prayer mat', 742: 'printer', 743: 'prison, prison house', 744: 'projectile, missile', 745: 'projector', 746: 'puck, hockey puck', 747: 'punching bag, punch bag, punching ball, punchball', 748: 'purse', 749: 'quill, quill pen', 750: 'quilt, comforter, comfort, puff', 751: 'racer, race car, racing car', 752: 'racket, racquet', 753: 'radiator', 754: 'radio, wireless', 755: 'radio telescope, radio reflector', 756: 'rain barrel', 757: 'recreational vehicle, RV, R.V.', 758: 'reel', 759: 'reflex camera', 760: 'refrigerator, icebox', 761: 'remote control, remote', 762: 'restaurant, eating house, eating place, 
eatery', 763: 'revolver, six-gun, six-shooter', 764: 'rifle', 765: 'rocking chair, rocker', 766: 'rotisserie', 767: 'rubber eraser, rubber, pencil eraser', 768: 'rugby ball', 769: 'rule, ruler', 770: 'running shoe', 771: 'safe', 772: 'safety pin', 773: 'saltshaker, salt shaker', 774: 'sandal', 775: 'sarong', 776: 'sax, saxophone', 777: 'scabbard', 778: 'scale, weighing machine', 779: 'school bus', 780: 'schooner', 781: 'scoreboard', 782: 'screen, CRT screen', 783: 'screw', 784: 'screwdriver', 785: 'seat belt, seatbelt', 786: 'sewing machine', 787: 'shield, buckler', 788: 'shoe shop, shoe-shop, shoe store', 789: 'shoji', 790: 'shopping basket', 791: 'shopping cart', 792: 'shovel', 793: 'shower cap', 794: 'shower curtain', 795: 'ski', 796: 'ski mask', 797: 'sleeping bag', 798: 'slide rule, slipstick', 799: 'sliding door', 800: 'slot, one-armed bandit', 801: 'snorkel', 802: 'snowmobile', 803: 'snowplow, snowplough', 804: 'soap dispenser', 805: 'soccer ball', 806: 'sock', 807: 'solar dish, solar collector, solar furnace', 808: 'sombrero', 809: 'soup bowl', 810: 'space bar', 811: 'space heater', 812: 'space shuttle', 813: 'spatula', 814: 'speedboat', 815: "spider web, spider's web", 816: 'spindle', 817: 'sports car, sport car', 818: 'spotlight, spot', 819: 'stage', 820: 'steam locomotive', 821: 'steel arch bridge', 822: 'steel drum', 823: 'stethoscope', 824: 'stole', 825: 'stone wall', 826: 'stopwatch, stop watch', 827: 'stove', 828: 'strainer', 829: 'streetcar, tram, tramcar, trolley, trolley car', 830: 'stretcher', 831: 'studio couch, day bed', 832: 'stupa, tope', 833: 'submarine, pigboat, sub, U-boat', 834: 'suit, suit of clothes', 835: 'sundial', 836: 'sunglass', 837: 'sunglasses, dark glasses, shades', 838: 'sunscreen, sunblock, sun blocker', 839: 'suspension bridge', 840: 'swab, swob, mop', 841: 'sweatshirt', 842: 'swimming trunks, bathing trunks', 843: 'swing', 844: 'switch, electric switch, electrical switch', 845: 'syringe', 846: 'table lamp', 847: 'tank, army tank, armored combat vehicle, armoured combat vehicle', 848: 'tape player', 849: 'teapot', 850: 'teddy, teddy bear', 851: 'television, television system', 852: 'tennis ball', 853: 'thatch, thatched roof', 854: 'theater curtain, theatre curtain', 855: 'thimble', 856: 'thresher, thrasher, threshing machine', 857: 'throne', 858: 'tile roof', 859: 'toaster', 860: 'tobacco shop, tobacconist shop, tobacconist', 861: 'toilet seat', 862: 'torch', 863: 'totem pole', 864: 'tow truck, tow car, wrecker', 865: 'toyshop', 866: 'tractor', 867: 'trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi', 868: 'tray', 869: 'trench coat', 870: 'tricycle, trike, velocipede', 871: 'trimaran', 872: 'tripod', 873: 'triumphal arch', 874: 'trolleybus, trolley coach, trackless trolley', 875: 'trombone', 876: 'tub, vat', 877: 'turnstile', 878: 'typewriter keyboard', 879: 'umbrella', 880: 'unicycle, monocycle', 881: 'upright, upright piano', 882: 'vacuum, vacuum cleaner', 883: 'vase', 884: 'vault', 885: 'velvet', 886: 'vending machine', 887: 'vestment', 888: 'viaduct', 889: 'violin, fiddle', 890: 'volleyball', 891: 'waffle iron', 892: 'wall clock', 893: 'wallet, billfold, notecase, pocketbook', 894: 'wardrobe, closet, press', 895: 'warplane, military plane', 896: 'washbasin, handbasin, washbowl, lavabo, wash-hand basin', 897: 'washer, automatic washer, washing machine', 898: 'water bottle', 899: 'water jug', 900: 'water tower', 901: 'whiskey jug', 902: 'whistle', 903: 'wig', 904: 'window screen', 905: 'window shade', 906: 'Windsor tie', 
907: 'wine bottle', 908: 'wing', 909: 'wok', 910: 'wooden spoon', 911: 'wool, woolen, woollen', 912: 'worm fence, snake fence, snake-rail fence, Virginia fence', 913: 'wreck', 914: 'yawl', 915: 'yurt', 916: 'web site, website, internet site, site', 917: 'comic book', 918: 'crossword puzzle, crossword', 919: 'street sign', 920: 'traffic light, traffic signal, stoplight', 921: 'book jacket, dust cover, dust jacket, dust wrapper', 922: 'menu', 923: 'plate', 924: 'guacamole', 925: 'consomme', 926: 'hot pot, hotpot', 927: 'trifle', 928: 'ice cream, icecream', 929: 'ice lolly, lolly, lollipop, popsicle', 930: 'French loaf', 931: 'bagel, beigel', 932: 'pretzel', 933: 'cheeseburger', 934: 'hotdog, hot dog, red hot', 935: 'mashed potato', 936: 'head cabbage', 937: 'broccoli', 938: 'cauliflower', 939: 'zucchini, courgette', 940: 'spaghetti squash', 941: 'acorn squash', 942: 'butternut squash', 943: 'cucumber, cuke', 944: 'artichoke, globe artichoke', 945: 'bell pepper', 946: 'cardoon', 947: 'mushroom', 948: 'Granny Smith', 949: 'strawberry', 950: 'orange', 951: 'lemon', 952: 'fig', 953: 'pineapple, ananas', 954: 'banana', 955: 'jackfruit, jak, jack', 956: 'custard apple', 957: 'pomegranate', 958: 'hay', 959: 'carbonara', 960: 'chocolate sauce, chocolate syrup', 961: 'dough', 962: 'meat loaf, meatloaf', 963: 'pizza, pizza pie', 964: 'potpie', 965: 'burrito', 966: 'red wine', 967: 'espresso', 968: 'cup', 969: 'eggnog', 970: 'alp', 971: 'bubble', 972: 'cliff, drop, drop-off', 973: 'coral reef', 974: 'geyser', 975: 'lakeside, lakeshore', 976: 'promontory, headland, head, foreland', 977: 'sandbar, sand bar', 978: 'seashore, coast, seacoast, sea-coast', 979: 'valley, vale', 980: 'volcano', 981: 'ballplayer, baseball player', 982: 'groom, bridegroom', 983: 'scuba diver', 984: 'rapeseed', 985: 'daisy', 986: "yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum", 987: 'corn', 988: 'acorn', 989: 'hip, rose hip, rosehip', 990: 'buckeye, horse chestnut, conker', 991: 'coral fungus', 992: 'agaric', 993: 'gyromitra', 994: 'stinkhorn, carrion fungus', 995: 'earthstar', 996: 'hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa', 997: 'bolete', 998: 'ear, spike, capitulum', 999: 'toilet tissue, toilet paper, bathroom tissue'} ================================================ FILE: examples/ONNX/resnet50/int8.py ================================================ import calibrator import tensorrt as trt # Use TensorRT ONNX parser to parse model file, and enable INT8 calibration during engine construction def build_int8_engine_onnx(model_file, image_dir, batch_size, calibration_batches, engine_file, cache_file='INT8CalibrationTable'): TRT_LOGGER = trt.Logger(trt.Logger.WARNING) with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, trt.OnnxParser(network, TRT_LOGGER) as parser: # Load the Onnx model and parse it in order to populate the TensorRT network. 
with open(model_file, 'rb') as model: parser.parse(model.read()) # Allow builder to use INT8 or FP16 kernels when building engine builder.int8_mode = True builder.fp16_mode = True calib = calibrator.ONNXEntropyCalibrator(image_dir, batch_size, calibration_batches, cache_file) builder.int8_calibrator = calib builder.max_batch_size = batch_size engine = builder.build_cuda_engine(network) with open(engine_file, 'wb') as f: f.write(engine.serialize()) ================================================ FILE: examples/ONNX/resnet50/onnx_utils.py ================================================ #!/usr/bin/env python3 import glob import os import onnx from onnx import numpy_helper from matplotlib import pyplot as plt import numpy as np def load_inputs(test_data_dir): # Load inputs inputs = [] inputs_num = len(glob.glob(os.path.join(test_data_dir, 'input_*.pb'))) for i in range(inputs_num): input_file = os.path.join(test_data_dir, 'input_{}.pb'.format(i)) tensor = onnx.TensorProto() with open(input_file, 'rb') as f: tensor.ParseFromString(f.read()) inputs.append(numpy_helper.to_array(tensor)) return inputs def load_outputs(test_data_dir): # Load reference outputs ref_outputs = [] ref_outputs_num = len(glob.glob(os.path.join(test_data_dir, 'output_*.pb'))) for i in range(ref_outputs_num): output_file = os.path.join(test_data_dir, 'output_{}.pb'.format(i)) tensor = onnx.TensorProto() with open(output_file, 'rb') as f: tensor.ParseFromString(f.read()) ref_outputs.append(numpy_helper.to_array(tensor)) return ref_outputs def mnist_image(data): two_d = (np.reshape(data, (28, 28))).astype(np.uint8) plt.imshow(two_d, interpolation='nearest') return plt def softmax(x): """Compute softmax values for each sets of scores in x.""" e_x = np.exp(x - np.max(x)) return e_x / e_x.sum() ================================================ FILE: examples/ONNX/resnet50/open_source_images.md5 ================================================ 6cd502bc217f3960cf34447ec4ede610 open_source_images.tar.gz ================================================ FILE: examples/ONNX/resnet50/resnet50.md5 ================================================ 0e8088c7b1a1a9b2d0a5ae05601cc55e resnet50.tar.gz ================================================ FILE: examples/ONNX/resnet50/run_jpeg_test.py ================================================ #!/usr/bin/env python3 import os import time import trtlab import onnx_utils as utils import numpy as np import matplotlib.pyplot as plt import mxnet as mx from mxnet.gluon.data.vision import transforms from imagenet_labels import labels import click tests = {} def tensorrt_init(engines): manager = trtlab.InferenceManager(max_exec_concurrency=4) runners = [] for engine in engines: name, _ = os.path.splitext(os.path.basename(engine)) runners.append(manager.register_tensorrt_engine(name, engine)) manager.update_resources() return runners def infer_image(runner, image): inputs = preprocess_image(runner, image) future = runner.infer(**inputs) result = future.get() for name, tensor in result.items(): tensor = tensor.reshape(1000) idx = np.argmax(tensor) print("\n*** Results ***") print(labels[idx], tensor[idx]) print("") def preprocess_image(runner, image_path): inputs = runner.input_bindings() keys = list(inputs.keys()) input_name = keys[0] img = np.array(plt.imread(image_path)) img = transform_image(img) return { input_name: img } def transform_image(img): transform_fn = transforms.Compose([ transforms.Resize(224), transforms.CenterCrop(224), transforms.ToTensor(), transforms.Normalize([0.485, 0.456, 0.406], 
[0.229, 0.224, 0.225]) ]) img = transform_fn(mx.nd.array(img)).asnumpy() img = np.expand_dims(img, axis=0) # batchify return img def validate_results(computed, expected): keys = list(computed.keys()) output_name = keys[0] output_value = computed[output_name] np.testing.assert_almost_equal(output_value, expected[0], decimal=3) print("-- Test Passed: All outputs {} match within 3 decimals".format(output_value.shape)) File = click.Path(exists=True, file_okay=True, dir_okay=False, resolve_path=True) Path = click.Path(exists=True, file_okay=False, dir_okay=True, resolve_path=True) @click.command() @click.option("--image", type=File, multiple=True) @click.argument("engine", type=File, nargs=1) def main(engine, image): runners = tensorrt_init([engine]) for runner in runners: for img in image: infer_image(runner, img) if __name__ == "__main__": main() ================================================ FILE: examples/ONNX/resnet50/run_onnx_tests.py ================================================ #!/usr/bin/env python3 import os import trtlab import numpy as np import click import onnx_utils as utils tests = {} def tensorrt_init(engines): manager = trtlab.InferenceManager(max_exec_concurrency=4) runners = [] for engine in engines: name, _ = os.path.splitext(os.path.basename(engine)) runners.append(manager.register_tensorrt_engine(name, engine)) manager.update_resources() return runners def test_data(test_path): for path, dirs, files in os.walk(test_path): if os.path.basename(path).startswith("test_"): tests[path] = files for path, files in tests.items(): inputs = utils.load_inputs(path) outputs = utils.load_outputs(path) print("** Testing {} **".format(path)) yield inputs, outputs def run_test(runner, inputs, outputs): inputs = preprocess_inputs(runner, inputs) future = runner.infer(**inputs) result = future.get() validate_results(result, outputs) def preprocess_inputs(runner, inputs): expected_input = runner.input_bindings() if len(expected_input) != len(inputs): raise RuntimeError("mismatched number of inputs") keys = list(expected_input.keys()) input_name = keys[0] info = expected_input[keys[0]] shape = info['shape'] tensor = inputs[0] batch_size = tensor.shape[0] if list(shape) != list(tensor.shape[1:]): raise RuntimeError("mismatched input dimensions") return { input_name: tensor } def validate_results(computed, expected): keys = list(computed.keys()) output_name = keys[0] output_value = computed[output_name] np.testing.assert_almost_equal(output_value, expected[0], decimal=3) print("-- Test Passed: All outputs {} match within 3 decimals".format(output_value.shape)) File = click.Path(exists=True, file_okay=True, dir_okay=False, resolve_path=True) Path = click.Path(exists=True, file_okay=False, dir_okay=True, resolve_path=True) @click.command() @click.option("--tests", type=Path, default="resnet50") @click.argument("engine", type=File, nargs=1) def main(engine, tests): runners = tensorrt_init([engine]) for runner in runners: for inputs, outputs in test_data(tests): run_test(runner, inputs, outputs) if __name__ == "__main__": main() ================================================ FILE: examples/nvRPC/CMakeLists.txt ================================================ # Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add_subdirectory(UnaryService) add_subdirectory(StreamingService) add_subdirectory(SharedMemoryService) # TODO: WIP # add_subdirectory(StreamingInOrderSendRecv) ================================================ FILE: examples/nvRPC/SharedMemoryService/CMakeLists.txt ================================================ # Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add_executable(sysv-nvrpc.x server.cc)
target_link_libraries(sysv-nvrpc.x
    nvrpc
    echo-protos
    gflags
)

add_executable(sysv-client.x client.cc)
target_link_libraries(sysv-client.x
    nvrpc
    echo-protos
    gflags
)


================================================
FILE: examples/nvRPC/SharedMemoryService/README.md
================================================
# Shared Memory Service

Client/Server service extending the basic nvRPC example.

The client (`sysv-client.x`) creates a `CyclicAllocator` from which it allocates
buffers of shared memory. The client:

1. Writes some data into shared memory (`batch_id` and `0xDEADBEEF` into `data[0]`
   and `data[1]`, respectively).
2. Packs the RPC message with the `batch_id` and the SystemV memory descriptor details.
3. Initiates the RPC.
4. Checks the value at the address in which it wrote `0xDEADBEEF` to ensure that the
   server wrote its response to the proper location.

On receipt of a client RPC, the server acquires a `Descriptor` to the offset in the
shared memory segment specified in the RPC. This is done via the
`ExternalSharedMemoryManager`. The server:

1. Acquires the `Descriptor` to the offset.
2. Checks the values at `data[0]` and `data[1]` for `batch_id` and `0xDEADBEEF`,
   respectively.
3. Writes the `batch_id` into the `[1]` element.
4. Returns the Response.
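The server side of this example never allocates shared memory; it only attaches to
segments the client created. For readers unfamiliar with System V shared memory, here
is a minimal sketch of the raw calls such an attach-and-offset scheme wraps. This is
illustration only: the names are ad hoc, and the trtlab `SystemV` class encapsulates
this machinery.

```c++
// Illustration only: raw System V shared-memory attach, size query, and
// offset arithmetic. Not the trtlab implementation.
#include <sys/ipc.h>
#include <sys/shm.h>
#include <cstddef>
#include <stdexcept>

struct AttachedSegment
{
    void*  base;
    size_t size;

    explicit AttachedSegment(int shm_id)
    {
        base = shmat(shm_id, nullptr, 0); // map an existing segment into this process
        if(base == (void*)-1) { throw std::runtime_error("shmat failed"); }
        shmid_ds ds{};
        shmctl(shm_id, IPC_STAT, &ds); // query the true segment size
        size = ds.shm_segsz;
    }

    ~AttachedSegment() { shmdt(base); } // detach; the segment itself persists

    // A "descriptor" is effectively (base + offset, length) into the segment
    void* at(size_t offset) { return static_cast<char*>(base) + offset; }
};
```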
================================================
FILE: examples/nvRPC/SharedMemoryService/client.cc
================================================
/* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <chrono>
#include <iostream>
#include <memory>
#include <string>

#include <glog/logging.h>
#include <gflags/gflags.h>

#include <grpcpp/grpcpp.h>

#include "tensorrt/laboratory/core/memory/cyclic_allocator.h"
#include "tensorrt/laboratory/core/memory/system_v.h"

#include "echo.grpc.pb.h"

using grpc::Channel;
using grpc::ClientContext;
using grpc::Status;

using simple::Inference;
using simple::Input;
using simple::Output;

using trtlab::CyclicAllocator;
using trtlab::SystemV;

static constexpr size_t one_mb = 1024 * 1024;

DEFINE_int32(count, 1, "number of grpc messages to send");

class SimpleClient final
{
  public:
    SimpleClient(std::shared_ptr<Channel> channel)
        : m_Stub(Inference::NewStub(channel)), m_Memory(5, one_mb)
    {
    }

    // Generate and send RPC message
    int Compute(const int batch_id)
    {
        // Allocate some SysV shared memory from the CyclicAllocator
        CyclicAllocator<SystemV>::Descriptor mdesc = RandomAllocation();

        // Populate the request object
        Input request;
        request.set_batch_id(batch_id);
        auto sysv = request.mutable_sysv();
        sysv->set_shm_id(mdesc->Stack().Memory().ShmID());
        sysv->set_offset(mdesc->Offset());
        sysv->set_size(mdesc->Size());

        // Write the batch_id to the shared memory segment
        // This will be validated against the batch_id in the message body on the server
        auto data = mdesc->CastToArray<size_t>();
        data[0] = batch_id;
        data[1] = 0xDEADBEEF;

        // Container for the data we expect from the server.
        Output reply;

        // Context for the client. It could be used to convey extra information to
        // the server and/or tweak certain RPC behaviors.
        ClientContext context;

        // The actual RPC.
        Status status = m_Stub->Compute(&context, request, &reply);
        if(status.ok())
        {
            CHECK_EQ(data[1], batch_id);
            return reply.batch_id();
        }
        else
        {
            LOG(ERROR) << status.error_code() << ": " << status.error_message();
            return -1;
        }
    }

  private:
    CyclicAllocator<SystemV>::Descriptor RandomAllocation()
    {
        size_t bytes = rand() % (m_Memory.MaxAllocationSize() / 4);
        bytes = std::max(bytes, 16UL); // guarantee at least 16 bytes (2x size_t)
        DLOG(INFO) << "RandomAllocation: " << bytes << " bytes";
        return m_Memory.Allocate(bytes);
    }

    std::unique_ptr<Inference::Stub> m_Stub;
    CyclicAllocator<SystemV> m_Memory;
};

int main(int argc, char** argv)
{
    FLAGS_alsologtostderr = 1; // It will dump to console
    ::google::ParseCommandLineFlags(&argc, &argv, true);

    SimpleClient client(
        grpc::CreateChannel("localhost:50051", grpc::InsecureChannelCredentials()));

    auto start = std::chrono::steady_clock::now();
    for(int i = 0; i < FLAGS_count; i++)
    {
        auto reply = client.Compute(i);
        LOG_IF(INFO, reply == -1) << "BatchId received: " << reply;
    }
    auto end = std::chrono::steady_clock::now();
    float elapsed = std::chrono::duration<float>(end - start).count();
    std::cout << FLAGS_count << " requests in " << elapsed << " seconds" << std::endl;

    return 0;
}
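The client above carves each request's buffer out of a `CyclicAllocator<SystemV>`. As
a rough mental model only (this is not trtlab's implementation, and `RingCarver` is a
made-up name): a cyclic allocator hands out offsets from a fixed segment and wraps to
the front when it reaches the end; the real class additionally rotates across
reference-counted memory stacks so in-flight descriptors keep their backing segment
alive.

```c++
// Sketch of the cyclic-allocation idea; assumes allocations fit in the segment.
#include <cstddef>

class RingCarver
{
  public:
    RingCarver(char* base, size_t capacity) : m_Base(base), m_Capacity(capacity) {}

    // Returns a pointer for `bytes` and reports its offset; wraps when the
    // remaining tail of the segment is too small.
    char* Allocate(size_t bytes, size_t& offset)
    {
        if(m_Head + bytes > m_Capacity) { m_Head = 0; } // wrap to the front
        offset = m_Head;
        m_Head += bytes;
        return m_Base + offset;
    }

  private:
    char*  m_Base;
    size_t m_Capacity;
    size_t m_Head = 0;
};
```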
================================================
FILE: examples/nvRPC/SharedMemoryService/server.cc
================================================
/* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <chrono>
#include <map>
#include <memory>
#include <mutex>

#include <glog/logging.h>
#include <gflags/gflags.h>

#include "nvrpc/executor.h"
#include "nvrpc/server.h"
#include "nvrpc/service.h"
#include "tensorrt/laboratory/core/memory/descriptor.h"
#include "tensorrt/laboratory/core/memory/system_v.h"
#include "tensorrt/laboratory/core/pool.h"
#include "tensorrt/laboratory/core/resources.h"
#include "tensorrt/laboratory/core/thread_pool.h"

#include "echo.grpc.pb.h"
#include "echo.pb.h"

using nvrpc::AsyncRPC;
using nvrpc::AsyncService;
using nvrpc::Context;
using nvrpc::Executor;
using nvrpc::Server;
using trtlab::Resources;
using trtlab::ThreadPool;
using trtlab::Descriptor;
using trtlab::SystemV;

// CLI Options
DEFINE_int32(thread_count, 1, "Size of thread pool");

/**
 * @brief SystemV Memory Manager
 *
 * This object does not allocate system v shared memory segments. Instead, it attaches and manages
 * descriptors into shared memory segments allocated by an external source.
 */
class ExternalSharedMemoryManager final
{
    class PartialSegmentDescriptor final : public Descriptor<SystemV>
    {
      public:
        PartialSegmentDescriptor(const std::shared_ptr<SystemV>& segment, size_t offset,
                                 size_t size)
            : Descriptor<SystemV>((*segment)[offset], size, "PartialSysVSegment"),
              m_Segment(segment)
        {
        }

        PartialSegmentDescriptor(PartialSegmentDescriptor&& other)
            : Descriptor<SystemV>(std::move(other)),
              m_Segment{std::exchange(other.m_Segment, nullptr)}
        {
        }

        PartialSegmentDescriptor& operator=(PartialSegmentDescriptor&&) = delete;
        DELETE_COPYABILITY(PartialSegmentDescriptor);

        virtual ~PartialSegmentDescriptor() override {}

      private:
        std::shared_ptr<SystemV> m_Segment;
    };

  public:
    ExternalSharedMemoryManager() = default;
    using Descriptor = std::unique_ptr<PartialSegmentDescriptor>;

    Descriptor Acquire(size_t shm_id, size_t offset, size_t size)
    {
        const auto& segment = GetOrAttachToShmID(shm_id);
        CHECK_LE(offset + size, segment->Size());
        return std::make_unique<PartialSegmentDescriptor>(segment, offset, size);
    }

    void Release(size_t shm_id)
    {
        std::lock_guard<std::mutex> l(m_Mutex);
        auto count = m_AttachedSegments.erase(shm_id);
        DLOG_IF(WARNING, count == 0) << "Attempting to Release an unmapped shm_id";
    }

  protected:
    std::shared_ptr<SystemV> GetOrAttachToShmID(size_t shm_id)
    {
        std::shared_ptr<SystemV> segment;
        std::lock_guard<std::mutex> l(m_Mutex);
        auto search = m_AttachedSegments.find(shm_id);
        if(search == m_AttachedSegments.end())
        {
            DLOG(INFO) << "SystemV Manager: attaching to shm_id: " << shm_id;
            segment = SystemV::Attach(shm_id);
            m_AttachedSegments[shm_id] = segment;
        }
        else
        {
            segment = search->second;
        }
        return segment;
    }

  private:
    std::map<size_t, std::shared_ptr<SystemV>> m_AttachedSegments;
    std::mutex m_Mutex;
};

struct SimpleResources : public Resources
{
    SimpleResources() = default;

    ExternalSharedMemoryManager& GetExternalSharedMemoryManager()
    {
        return m_ExternalSharedMemoryManager;
    }

  private:
    ExternalSharedMemoryManager m_ExternalSharedMemoryManager;
};

class SimpleContext final : public Context<simple::Input, simple::Output, SimpleResources>
{
    void ExecuteRPC(RequestType& input, ResponseType& output) final override
    {
        ExternalSharedMemoryManager::Descriptor mdesc;
        if(input.has_sysv())
        {
            mdesc = GetResources()->GetExternalSharedMemoryManager().Acquire(
                input.sysv().shm_id(), input.sysv().offset(), input.sysv().size());
        }
        CHECK(mdesc);
        auto array = mdesc->CastToArray<size_t>();
        CHECK_EQ(array[0], input.batch_id());
        CHECK_EQ(array[1], 0xDEADBEEF);
        array[1] = input.batch_id();
        output.set_batch_id(input.batch_id());
        this->FinishResponse();
    }
};

int main(int argc, char* argv[])
{
    FLAGS_alsologtostderr = 1; // Log to console
    ::google::InitGoogleLogging("simpleServer");
    ::google::ParseCommandLineFlags(&argc, &argv, true);

    // A server will bind an IP:PORT to listen on
    Server server("0.0.0.0:50051");

    // A server can host multiple services
    LOG(INFO) << "Register Service (simple::Inference) with Server";
    auto simpleInference = server.RegisterAsyncService<simple::Inference::AsyncService>();

    LOG(INFO) << "Register RPC (simple::Inference::Compute) with Service (simple::Inference)";
    auto rpcCompute = simpleInference->RegisterRPC<SimpleContext>(
        &simple::Inference::AsyncService::RequestCompute);

    LOG(INFO) << "Initializing Resources for RPC (simple::Inference::Compute)";
    auto rpcResources = std::make_shared<SimpleResources>();

    LOG(INFO) << "Creating Executor";
    auto executor = server.RegisterExecutor(new Executor(1));

    LOG(INFO) << "Creating Execution Contexts for RPC (simple::Inference::Compute) with Executor";
    executor->RegisterContexts(rpcCompute, rpcResources, 10);

    LOG(INFO) << "Running Server";
    server.Run(std::chrono::milliseconds(2000), [] {});
}
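The manager above caches one attachment per `shm_id` and hands out lightweight
descriptors into it. A hedged sketch of driving it outside the RPC path follows;
`HandleRequest` is a hypothetical helper, and in the real server the `shm_id` and
`offset` arrive in the RPC message, as in `SimpleContext::ExecuteRPC` above.

```c++
// Hypothetical standalone usage of ExternalSharedMemoryManager (names from the
// file above; this helper is not part of the example's build).
void HandleRequest(ExternalSharedMemoryManager& mgr, size_t shm_id, size_t offset)
{
    auto mdesc = mgr.Acquire(shm_id, offset, 2 * sizeof(size_t));
    auto array = mdesc->CastToArray<size_t>();
    array[1] = array[0];    // echo the batch_id back through shared memory
    // mgr.Release(shm_id); // optionally drop the cached attachment when done
}
```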
================================================
FILE: examples/nvRPC/StreamingInOrderSendRecv/CMakeLists.txt
================================================
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

add_executable(nvrpc-bidirectional-server.x server.cc)
target_link_libraries(nvrpc-bidirectional-server.x
    nvrpc
    echo-protos
    gflags
)

add_executable(nvrpc-bidirectional-client.x
    client.cc
)
target_link_libraries(nvrpc-bidirectional-client.x
    nvrpc
    nvrpc-client
    echo-protos
    gflags
)


================================================
FILE: examples/nvRPC/StreamingInOrderSendRecv/README.md
================================================
# BidirectionalStream In-Order Send/Recv

```
rpc InOrderSendRecv (stream Request) returns (stream Response)
```

The service will accept a stream of Requests, queue them for in-order execution via
the `ExecuteRPC` virtual method, and for each result of `ExecuteRPC` return a
Response on the stream. Only one `ExecuteRPC` call can be in flight at any time,
which allows the RPC to optionally maintain state. In some regards, this lifecycle
could be used to model an RNN, as the collective history can be built into the
resources.


================================================
FILE: examples/nvRPC/StreamingInOrderSendRecv/client.cc
================================================
/* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include "nvrpc/client/client_streaming.h" #include "nvrpc/client/executor.h" using grpc::Channel; using grpc::ClientContext; using grpc::Status; using nvrpc::client::ClientBidirectional; using nvrpc::client::Executor; #include "echo.grpc.pb.h" using simple::Inference; using simple::Input; using simple::Output; DEFINE_int32(count, 100, "number of grpc messages to send"); DEFINE_int32(thread_count, 1, "Size of thread pool"); DEFINE_string(hostname, "127.0.0.1:50051", "hostname and port"); int main(int argc, char** argv) { // Instantiate the client. It requires a channel, out of which the actual RPCs // are created. This channel models a connection to an endpoint (in this case, // localhost at port 50051). We indicate that the channel isn't authenticated // (use of InsecureChannelCredentials()). FLAGS_alsologtostderr = 1; // It will dump to console ::google::ParseCommandLineFlags(&argc, &argv, true); std::mutex mutex; std::size_t count = 0; auto executor = std::make_shared(FLAGS_thread_count); auto channel = grpc::CreateChannel(FLAGS_hostname, grpc::InsecureChannelCredentials()); auto stub = Inference::NewStub(channel); auto infer_prepare_fn = [&stub](::grpc::ClientContext * context, ::grpc::CompletionQueue * cq) -> auto { return std::move(stub->PrepareAsyncBidirectional(context, cq)); }; auto stream = std::make_unique>( infer_prepare_fn, executor, [](Input&& request) { LOG_FIRST_N(INFO, 10) << "Sent Request with BatchID: " << request.batch_id(); static size_t last = 0; CHECK_EQ(last + 1, request.batch_id()); ++last; // CHECK(request.batch_id()); }, [&mutex, &count](Output&& response) { LOG_FIRST_N(INFO, 10) << "Received Response with BatchID: " << response.batch_id(); // CHECK(response.batch_id()); std::lock_guard lock(mutex); --count; }); auto start = std::chrono::steady_clock::now(); auto elapsed = [start]() -> float { return std::chrono::duration(std::chrono::steady_clock::now() - start).count(); }; for(int i = 1; i < FLAGS_count + 1; i++) { { std::lock_guard lock(mutex); ++count; } Input input; input.set_batch_id(i); stream->Send(std::move(input)); } std::cout << FLAGS_count << " queued in " << elapsed() << "seconds" << std::endl; auto future = stream->Done(); auto status = future.get(); executor->ShutdownAndJoin(); CHECK_EQ(count, 0UL); std::cout << FLAGS_count << " completed in " << elapsed() << "seconds" << std::endl; std::cout << "gRPC Status: " << (status.ok() ? "OK" : "NOT OK") << std::endl; return 0; } ================================================ FILE: examples/nvRPC/StreamingInOrderSendRecv/server.cc ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. 
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <chrono>
#include <memory>
#include <thread>

#include <glog/logging.h>
#include <gflags/gflags.h>

#include "tensorrt/laboratory/core/pool.h"
#include "tensorrt/laboratory/core/resources.h"
#include "tensorrt/laboratory/core/thread_pool.h"

using trtlab::Resources;
using trtlab::ThreadPool;

#include "nvrpc/executor.h"
#include "nvrpc/server.h"
#include "nvrpc/service.h"

using nvrpc::AsyncRPC;
using nvrpc::AsyncService;
using nvrpc::BidirectionalContext;
using nvrpc::Executor;
using nvrpc::Server;

#include "echo.grpc.pb.h"
#include "echo.pb.h"

// CLI Options
DEFINE_int32(thread_count, 1, "Size of thread pool");

// Define the resources your RPC will need to execute
// ==================================================
// In this case, all simple::Inference::Compute RPCs share a threadpool on which they
// queue up some work. This essentially means, after the message has been received and
// processed, the actual work for the RPC is pushed to a worker pool outside the scope of
// the transaction processing system (TPS). This is essentially async computing: we have
// decoupled the transaction from the workers executing the implementation. The TPS can
// continue to queue work, while the workers process the load.
struct SimpleResources : public Resources
{
    SimpleResources(int numThreadsInPool = 3) : m_ThreadPool(numThreadsInPool)
    {
        LOG(INFO) << "Server ThreadCount: " << numThreadsInPool;
    }

    ThreadPool& AcquireThreadPool() { return m_ThreadPool; }

  private:
    ThreadPool m_ThreadPool;
};

// Contexts hold the state and provide the definition of the work to be performed by the RPC.
// This is where you define what gets executed for a given RPC.
// Incoming Message = simple::Input (RequestType)
// Outgoing Message = simple::Output (ResponseType)
class SimpleContext final
    : public BidirectionalContext<simple::Input, simple::Output, SimpleResources>
{
    void ExecuteRPC(RequestType& input, ResponseType& output) final override
    {
        // We could do work here, but we'd block the TPS, i.e. the threads pulling messages
        // off the incoming receive queue. Very quick responses are best done here; however,
        // longer running workloads should be offloaded so the TPS can avoid being blocked.

        // GetResources()->AcquireThreadPool().enqueue([this, &input, &output]{
        // Now running on a worker thread of the ThreadPool defined in SimpleResources.

        // Here we are just echoing back the incoming batch_id; however, in later
        // examples, we'll show how to run an async cuda pipeline.
        LOG_FIRST_N(INFO, 10) << "BatchID: " << input.batch_id() << " Tag = " << Tag()
                              << " Thread = " << std::this_thread::get_id();
        output.set_batch_id(input.batch_id());
        this->FinishResponse();
        // });

        // The TPS thread is now free to continue processing messages - async ftw!
    }
};

DEFINE_string(ip_port, "0.0.0.0:50051", "IP/Port");

int main(int argc, char* argv[])
{
    FLAGS_alsologtostderr = 1; // Log to console
    ::google::InitGoogleLogging("simpleServer");
    ::google::ParseCommandLineFlags(&argc, &argv, true);

    // A server will bind an IP:PORT to listen on
    Server server(FLAGS_ip_port);

    // A server can host multiple services
    LOG(INFO) << "Register Service (simple::Inference) with Server";
    auto simpleInference = server.RegisterAsyncService<simple::Inference::AsyncService>();

    // An RPC has two components that need to be specified when registering with the service:
    //  1) Type of Execution Context (SimpleContext). The execution context defines the behavior
    //     of the RPC, i.e. it contains the control logic for the execution of the RPC.
    //  2) The Request function (RequestBidirectional) which was generated by gRPC when compiling
    //     the protobuf which defined the service. This function is responsible for queuing the
    //     RPC's execution context to the gRPC completion queue.
    LOG(INFO) << "Register RPC (simple::Inference::Compute) with Service (simple::Inference)";
    auto rpcCompute = simpleInference->RegisterRPC<SimpleContext>(
        &simple::Inference::AsyncService::RequestBidirectional);

    LOG(INFO) << "Initializing Resources for RPC (simple::Inference::Compute)";
    auto rpcResources = std::make_shared<SimpleResources>(FLAGS_thread_count);

    // Create Executors - Executors provide the message processing resources for the RPCs.
    // Multiple Executors can be registered with a Server. The executor is responsible
    // for pulling incoming messages off the receive queue and executing the associated
    // context. By default, an executor only uses a single thread. A typical use case is
    // an Executor executing a context which immediately pushes the work to a thread pool.
    // However, for very low-latency messaging, you might want to use a multi-threaded
    // Executor and a Blocking Context - meaning the Context performs the entire RPC function
    // on the Executor's thread.
    LOG(INFO) << "Creating Executor";
    auto executor = server.RegisterExecutor(new Executor(1));

    // You can register RPC execution contexts from any registered RPC on any executor.
    // The power of that will become clear in later examples. For now, we will register
    // 10 instances of the simple::Inference::Compute RPC's SimpleContext execution context
    // with the Executor.
    LOG(INFO) << "Creating Execution Contexts for RPC (simple::Inference::Compute) with Executor";
    executor->RegisterContexts(rpcCompute, rpcResources, 10);

    LOG(INFO) << "Running Server";
    server.Run(std::chrono::milliseconds(2000), [] {
        // This is a timeout loop executed every 2 seconds.
        // Run() with no arguments will run an empty timeout loop every 5 seconds.
        // RunAsync() will return immediately; it's your responsibility to ensure the
        // server doesn't go out of scope, or a Shutdown will be triggered on your services.
    });
}
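Because the in-order lifecycle serializes `ExecuteRPC` calls per stream (see the README
above), a context may safely carry state across requests. A minimal sketch, assuming
the same `BidirectionalContext` template signature used by `SimpleContext` above; the
running-sum behavior and class name are purely illustrative, not part of the example.

```c++
// Hedged sketch: a stateful in-order context. Safe only because at most one
// ExecuteRPC is in flight per stream in this lifecycle.
class RunningSumContext final
    : public BidirectionalContext<simple::Input, simple::Output, SimpleResources>
{
    void ExecuteRPC(RequestType& input, ResponseType& output) final override
    {
        m_Sum += input.batch_id();  // accumulate across the stream's requests
        output.set_batch_id(m_Sum); // respond with the running total
        this->FinishResponse();
    }

    // NOTE: contexts are recycled across streams, so a real service would reset
    // m_Sum when a new stream begins (the hook for that is not shown here).
    std::uint64_t m_Sum = 0;
};
```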
================================================
FILE: examples/nvRPC/StreamingInOrderSendRecv/test.sh
================================================
#!/bin/bash

cleanup() {
    kill $(jobs -p) ||:
}
trap "cleanup" EXIT SIGINT SIGTERM

./nvrpc-bidirectional-server.x --ip_port="0.0.0.0:5555" &

f=$(mktemp)
cat <<'EOF' > $f
PS1='nvRPC Bidirectional: '
go() {
    ./nvrpc-bidirectional-client.x --hostname="localhost:5555" --count=${1:-100}
}
EOF

ps aux
echo
echo 'Try ./nvrpc-bidirectional-client.x --hostname="localhost:5555" --count=100'
bash --rcfile <(echo "PS1='nvRPC Bidirectional: '")


================================================
FILE: examples/nvRPC/StreamingService/CMakeLists.txt
================================================
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

set(LIBS nvrpc echo-protos gflags)

add_executable(nvrpc-ping-pong-server.x ping-pong.cc)
add_executable(nvrpc-even-odds-server.x even-odds.cc)

target_link_libraries(nvrpc-ping-pong-server.x ${LIBS})
target_link_libraries(nvrpc-even-odds-server.x ${LIBS})

add_executable(nvrpc-streaming-client.x client.cc)
target_link_libraries(nvrpc-streaming-client.x ${LIBS} nvrpc-client)


================================================
FILE: examples/nvRPC/StreamingService/README.md
================================================
# Streaming Examples

Async gRPC streaming can take on many forms. nvRPC provides a set of LifeCycles to
accommodate a variety of common use-cases. For all examples, the RPC that we will
implement has the same form:

```protobuf
rpc Bidirectional (stream Input) returns (stream Output) {}
```

To implement a `StreamingContext`, you must implement the `RequestReceived` pure
virtual method. This method is triggered once for each incoming request that is read
from the stream. The `ServerStream` object is used to write responses or close the
stream.
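Before the full `ServerStream` contract, here is a minimal sketch of an echoing
`RequestReceived` implementation. The template arguments and the `ServerStream` type
follow the `ping-pong.cc` and `even-odds.cc` examples later in this listing, and the
class name is hypothetical; treat it as illustrative rather than canonical.

```c++
// Hedged sketch: echo each request back on the stream, one response per request.
class EchoContext final : public StreamingContext<simple::Input, simple::Output, SimpleResources>
{
    void RequestReceived(RequestType&& input, std::shared_ptr<ServerStream> stream) final override
    {
        ResponseType output;
        output.set_batch_id(input.batch_id());    // mirror the request's batch_id
        stream->WriteResponse(std::move(output)); // returns false if disconnected
    }
};
```

The full set of operations available on the `ServerStream` object is spelled out in
the annotated context below: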
```c++
class SimpleContext final : public StreamingContext<simple::Input, simple::Output, SimpleResources>
{
    void RequestReceived(RequestType&& input, std::shared_ptr<ServerStream> stream) final override
    {
        // `input` incoming message of Input/RequestType
        // `stream` allows you to:
        //   ->StreamID()
        //     - unique identifier for this stream; note, the ID will be reused when the
        //       context is recycled.
        //   ->WriteResponse(Output&&)
        //     - writes a response on the stream; returns `true` if the stream is connected;
        //       otherwise, `false` if the stream is disconnected
        //   ->IsConnected()
        //     - bool - is the stream still connected to the client. `FinishStream` or
        //       `CancelStream` will disconnect all `ServerStream` objects that share the
        //       same StreamID.
        //   ->FinishStream()
        //     - Close the Stream from the server-side with status OK
        //   ->CancelStream()
        //     - Close the Stream with status CANCELLED
        //
        // NOTE: The gRPC stream will stay open to the client until:
        //   1) the client closes its half of the stream, and
        //   2) all `ServerStream` objects are destroyed OR
        //      the stream is explicitly closed by Cancel/FinishStream
    }
};
```

The final comment is worth further discussion. The life of the `ServerStream` object
does not have to be tied to the life of the `RequestReceived` call. You can pass the
`ServerStream` object off to an external resource, which can then write messages on
the stream as long as the stream remains connected. The stream will disconnect
implicitly.

- `ping-pong.cc` - in-order send/recv stream. The client sends a Request and the
  server responds with the same value for `batch_id`.


================================================
FILE: examples/nvRPC/StreamingService/client.cc
================================================
/* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include #include #include #include #include #include #include #include "nvrpc/client/client_streaming.h" #include "nvrpc/client/executor.h" using grpc::Channel; using grpc::ClientContext; using grpc::Status; using nvrpc::client::ClientStreaming; using nvrpc::client::Executor; #include "echo.grpc.pb.h" using simple::Inference; using simple::Input; using simple::Output; static bool ValidateEven(const char* flagname, int value) { LOG_IF(ERROR, value % 2) << "Examples require an even number of messages"; return (value % 2 == 0); } DEFINE_int32(count, 100, "number of grpc messages to send"); DEFINE_validator(count, &ValidateEven); DEFINE_int32(thread_count, 1, "Size of thread pool"); DEFINE_string(hostname, "127.0.0.1:50051", "hostname and port"); int main(int argc, char** argv) { // Instantiate the client. It requires a channel, out of which the actual RPCs // are created. This channel models a connection to an endpoint (in this case, // localhost at port 50051). We indicate that the channel isn't authenticated // (use of InsecureChannelCredentials()). FLAGS_alsologtostderr = 1; // It will dump to console ::google::ParseCommandLineFlags(&argc, &argv, true); std::mutex mutex; std::size_t count = 0; auto executor = std::make_shared(FLAGS_thread_count); auto channel = grpc::CreateChannel(FLAGS_hostname, grpc::InsecureChannelCredentials()); auto stub = Inference::NewStub(channel); auto infer_prepare_fn = [&stub](::grpc::ClientContext * context, ::grpc::CompletionQueue * cq) -> auto { return std::move(stub->PrepareAsyncBidirectional(context, cq)); }; auto stream = std::make_unique>( infer_prepare_fn, executor, [](Input&& request) { LOG_FIRST_N(INFO, 10) << "Sent Request with BatchID: " << request.batch_id(); }, [&mutex, &count](Output&& response) { static size_t last = 0; LOG_FIRST_N(INFO, 10) << "Received Response with BatchID: " << response.batch_id(); CHECK_EQ(++last, response.batch_id()); std::lock_guard lock(mutex); --count; }); auto start = std::chrono::steady_clock::now(); auto elapsed = [start]() -> float { return std::chrono::duration(std::chrono::steady_clock::now() - start).count(); }; for(int i = 1; i < FLAGS_count + 1; i++) { { std::lock_guard lock(mutex); ++count; } Input input; input.set_batch_id(i); stream->Write(std::move(input)); } std::cout << FLAGS_count << " queued in " << elapsed() << "seconds" << std::endl; auto future = stream->Done(); // auto future = stream->Status(); auto status = future.get(); std::cout << FLAGS_count << " completed in " << elapsed() << "seconds" << std::endl; std::cout << "gRPC Status: " << (status.ok() ? "OK" : "NOT OK") << std::endl; executor->ShutdownAndJoin(); CHECK_EQ(count, 0UL); return 0; } ================================================ FILE: examples/nvRPC/StreamingService/common.h ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #pragma once #include #include #include #include #include "tensorrt/laboratory/core/pool.h" #include "tensorrt/laboratory/core/resources.h" #include "tensorrt/laboratory/core/thread_pool.h" using trtlab::Resources; using trtlab::ThreadPool; #include "nvrpc/executor.h" #include "nvrpc/server.h" #include "nvrpc/service.h" using nvrpc::AsyncRPC; using nvrpc::AsyncService; using nvrpc::Executor; using nvrpc::Server; using nvrpc::StreamingContext; #include "echo.grpc.pb.h" #include "echo.pb.h" ================================================ FILE: examples/nvRPC/StreamingService/even-odds.cc ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */
#include "common.h"

// clang-format off
struct SimpleResources : public Resources
{
    SimpleResources(int numThreadsInPool = 3) : m_ThreadPool(numThreadsInPool) {}
    ThreadPool& AcquireThreadPool() { return m_ThreadPool; }
  private:
    ThreadPool m_ThreadPool;
};
// clang-format on

class SimpleContext final : public StreamingContext<simple::Input, simple::Output, SimpleResources>
{
    void RequestReceived(RequestType&& input, std::shared_ptr<ServerStream> stream) final override
    {
        LOG_FIRST_N(INFO, 10) << "BatchID: " << input.batch_id() << " Tag = " << Tag()
                              << " Thread = " << std::this_thread::get_id();

        // If even, send back two responses.
        // If odd, do nothing.
        if(input.batch_id() % 2 == 0)
        {
            LOG_FIRST_N(INFO, 5) << "Received an Even BatchID: Sending back two responses";
            for(int i = input.batch_id() - 1; i <= input.batch_id(); i++)
            {
                ResponseType output;
                output.set_batch_id(i);
                stream->WriteResponse(std::move(output));
            }
        }
        else
        {
            LOG_FIRST_N(INFO, 5) << "Received an Odd BatchID: No Response will be sent";
        }
    }
};

// CLI Options
DEFINE_int32(thread_count, 1, "Size of thread pool");
DEFINE_string(ip_port, "0.0.0.0:50051", "IP/Port");

int main(int argc, char* argv[])
{
    FLAGS_alsologtostderr = 1; // Log to console
    ::google::InitGoogleLogging("simpleServer");
    ::google::ParseCommandLineFlags(&argc, &argv, true);

    Server server(FLAGS_ip_port);
    auto simpleInference = server.RegisterAsyncService<simple::Inference::AsyncService>();
    auto rpcCompute = simpleInference->RegisterRPC<SimpleContext>(
        &simple::Inference::AsyncService::RequestBidirectional);
    auto rpcResources = std::make_shared<SimpleResources>(FLAGS_thread_count);
    auto executor = server.RegisterExecutor(new Executor(1));
    executor->RegisterContexts(rpcCompute, rpcResources, 10);
    server.Run(std::chrono::milliseconds(2000), [] {});
}
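For clarity, here is the request/response pattern this server produces, and why the
streaming client insists on an even `--count` (its `ValidateEven` flag validator
appears in `client.cc` above). The trace below is an illustration, not part of the
build.

```c++
// Expected wire pattern against even-odds:
//   send batch_id=1 -> no response
//   send batch_id=2 -> responses with batch_id 1, then 2
//   send batch_id=3 -> no response
//   send batch_id=4 -> responses with batch_id 3, then 4
// Response count equals request count only when the client sends an even number
// of requests, which is why the client validates --count with ValidateEven.
```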
================================================
FILE: examples/nvRPC/StreamingService/ping-pong.cc
================================================
/* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "common.h"

// clang-format off
struct SimpleResources : public Resources
{
    SimpleResources(int numThreadsInPool = 3) : m_ThreadPool(numThreadsInPool) {}
    ThreadPool& AcquireThreadPool() { return m_ThreadPool; }
  private:
    ThreadPool m_ThreadPool;
};
// clang-format on

class SimpleContext final : public StreamingContext<simple::Input, simple::Output, SimpleResources>
{
    void RequestReceived(RequestType&& input, std::shared_ptr<ServerStream> stream) final override
    {
        LOG_FIRST_N(INFO, 10) << "BatchID: " << input.batch_id() << " Tag = " << Tag()
                              << " Thread = " << std::this_thread::get_id();
        ResponseType output;
        output.set_batch_id(input.batch_id());
        stream->WriteResponse(std::move(output));
        // TODO: add a test in which the server closes the stream before the client does
        // stream->FinishStream();
    }
};

// CLI Options
DEFINE_int32(thread_count, 1, "Size of thread pool");
DEFINE_string(ip_port, "0.0.0.0:50051", "IP/Port");

int main(int argc, char* argv[])
{
    FLAGS_alsologtostderr = 1; // Log to console
    ::google::InitGoogleLogging("simpleServer");
    ::google::ParseCommandLineFlags(&argc, &argv, true);

    Server server(FLAGS_ip_port);
    auto simpleInference = server.RegisterAsyncService<simple::Inference::AsyncService>();
    auto rpcCompute = simpleInference->RegisterRPC<SimpleContext>(
        &simple::Inference::AsyncService::RequestBidirectional);
    auto rpcResources = std::make_shared<SimpleResources>(FLAGS_thread_count);
    auto executor = server.RegisterExecutor(new Executor(1));
    executor->RegisterContexts(rpcCompute, rpcResources, 10);
    server.Run(std::chrono::milliseconds(2000), [] {});
}


================================================
FILE: examples/nvRPC/StreamingService/test.sh
================================================
#!/bin/bash

cleanup() {
    kill $(jobs -p) ||:
}
trap "cleanup" EXIT SIGINT SIGTERM

export PATH=".:$PATH"
exe=${1:-"./nvrpc-ping-pong-server.x"}

$exe --ip_port="0.0.0.0:5555" &

f=$(mktemp)
cat <<'EOF' > $f
PS1='nvRPC Bidirectional: '
go() {
    ./nvrpc-streaming-client.x --hostname="localhost:5555" --count=${1:-100}
}
EOF

ps aux
echo
echo 'Try ./nvrpc-streaming-client.x --hostname="localhost:5555" --count=100'
bash --rcfile <(echo "PS1='nvRPC StreamingService: '")


================================================
FILE: examples/nvRPC/UnaryService/CMakeLists.txt
================================================
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add_executable(nvrpc-unary-server.x server.cc) target_link_libraries(nvrpc-unary-server.x nvrpc echo-protos gflags ) add_executable(nvrpc-unary-client.x client.cc) target_link_libraries(nvrpc-unary-client.x nvrpc nvrpc-client echo-protos gflags ) ================================================ FILE: examples/nvRPC/UnaryService/client.cc ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include "nvrpc/client/client_unary.h" #include "nvrpc/client/executor.h" using grpc::Channel; using grpc::ClientContext; using grpc::Status; using nvrpc::client::ClientUnary; using nvrpc::client::Executor; #include "echo.grpc.pb.h" using simple::Inference; using simple::Input; using simple::Output; DEFINE_int32(count, 100, "number of grpc messages to send"); DEFINE_int32(thread_count, 1, "Size of thread pool"); int main(int argc, char** argv) { // Instantiate the client. It requires a channel, out of which the actual RPCs // are created. This channel models a connection to an endpoint (in this case, // localhost at port 50051). We indicate that the channel isn't authenticated // (use of InsecureChannelCredentials()). 
FLAGS_alsologtostderr = 1; // It will dump to console ::google::ParseCommandLineFlags(&argc, &argv, true); auto executor = std::make_shared(FLAGS_thread_count); auto channel = grpc::CreateChannel("localhost:50051", grpc::InsecureChannelCredentials()); auto stub = Inference::NewStub(channel); auto infer_prepare_fn = [&stub](::grpc::ClientContext * context, const ::simple::Input& request, ::grpc::CompletionQueue* cq) -> auto { return std::move(stub->PrepareAsyncCompute(context, request, cq)); }; auto runner = std::make_unique>(infer_prepare_fn, executor); auto start = std::chrono::steady_clock::now(); auto elapsed = [start]() -> float { return std::chrono::duration(std::chrono::steady_clock::now() - start).count(); }; for(int i = 0; i < FLAGS_count; i++) { Input input; input.set_batch_id(i); runner->Enqueue(std::move(input), [i](Input& input, Output& output, ::grpc::Status& status) -> bool { CHECK(output.batch_id() == i); LOG_FIRST_N(INFO, 20) << "Check: " << i; return (bool)(output.batch_id() == i); }); } std::cout << FLAGS_count << " queued in " << elapsed() << "seconds" << std::endl; executor->ShutdownAndJoin(); std::cout << FLAGS_count << " completed in " << elapsed() << "seconds" << std::endl; return 0; } ================================================ FILE: examples/nvRPC/UnaryService/server.cc ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include "tensorrt/laboratory/core/pool.h" #include "tensorrt/laboratory/core/resources.h" #include "tensorrt/laboratory/core/thread_pool.h" using trtlab::Resources; using trtlab::ThreadPool; #include "nvrpc/executor.h" #include "nvrpc/server.h" #include "nvrpc/service.h" using nvrpc::AsyncRPC; using nvrpc::AsyncService; using nvrpc::Context; using nvrpc::Executor; using nvrpc::Server; #include "echo.grpc.pb.h" #include "echo.pb.h" // CLI Options DEFINE_int32(thread_count, 1, "Size of thread pool"); /** * Embedding a copy of the Protobuf specification for the gRPC service. 
 *
 * Package Name: simple
 * Service Name: Inference
 * RPC Name: Compute
 *
 * Incoming Message: Input
 * Outgoing Message: Output
 **
syntax = "proto3";
package simple;

service Inference {
   rpc Compute (Input) returns (Output) {}
}

message Input {
   uint64 batch_id = 1;
}

message Output {
   uint64 batch_id = 1;
}
*/

// Define the resources your RPC will need to execute
// ==================================================
// In this case, all simple::Inference::Compute RPCs share a threadpool on which they
// queue up some work. This essentially means, after the message has been received and
// processed, the actual work for the RPC is pushed to a worker pool outside the scope of
// the transaction processing system (TPS). This is essentially async computing: we have
// decoupled the transaction from the workers executing the implementation. The TPS can
// continue to queue work, while the workers process the load.
struct SimpleResources : public Resources
{
    SimpleResources(int numThreadsInPool = 3) : m_ThreadPool(numThreadsInPool)
    {
        LOG(INFO) << "Server ThreadCount: " << numThreadsInPool;
    }

    ThreadPool& AcquireThreadPool() { return m_ThreadPool; }

  private:
    ThreadPool m_ThreadPool;
};

// Contexts hold the state and provide the definition of the work to be performed by the RPC.
// This is where you define what gets executed for a given RPC.
// Incoming Message = simple::Input (RequestType)
// Outgoing Message = simple::Output (ResponseType)
class SimpleContext final : public Context<simple::Input, simple::Output, SimpleResources>
{
    void ExecuteRPC(RequestType& input, ResponseType& output) final override
    {
        // We could do work here, but we'd block the TPS, i.e. the threads pulling messages
        // off the incoming receive queue. Very quick responses are best done here; however,
        // longer running workloads should be offloaded so the TPS can avoid being blocked.
        GetResources()->AcquireThreadPool().enqueue([this, &input, &output] {
            // Now running on a worker thread of the ThreadPool defined in SimpleResources.
            // Here we are just echoing back the incoming batch_id; however, in later
            // examples, we'll show how to run an async cuda pipeline.
            LOG_FIRST_N(INFO, 20) << "Tag = " << Tag()
                                  << " Thread = " << std::this_thread::get_id();
            output.set_batch_id(input.batch_id());
            this->FinishResponse();
        });
        // The TPS thread is now free to continue processing messages - async ftw!
    }
};

int main(int argc, char* argv[])
{
    FLAGS_alsologtostderr = 1; // Log to console
    ::google::InitGoogleLogging("simpleServer");
    ::google::ParseCommandLineFlags(&argc, &argv, true);

    // A server will bind an IP:PORT to listen on
    Server server("0.0.0.0:50051");

    // A server can host multiple services
    LOG(INFO) << "Register Service (simple::Inference) with Server";
    auto simpleInference = server.RegisterAsyncService<simple::Inference::AsyncService>();

    // An RPC has two components that need to be specified when registering with the service:
    //  1) Type of Execution Context (SimpleContext). The execution context defines the behavior
    //     of the RPC, i.e. it contains the control logic for the execution of the RPC.
    //  2) The Request function (RequestCompute) which was generated by gRPC when compiling the
    //     protobuf which defined the service.
    //     This function is responsible for queuing the RPC's execution context to the
    //     gRPC completion queue.
    LOG(INFO) << "Register RPC (simple::Inference::Compute) with Service (simple::Inference)";
    auto rpcCompute = simpleInference->RegisterRPC<SimpleContext>(
        &simple::Inference::AsyncService::RequestCompute);

    LOG(INFO) << "Initializing Resources for RPC (simple::Inference::Compute)";
    auto rpcResources = std::make_shared<SimpleResources>(FLAGS_thread_count);

    // Create Executors - Executors provide the message processing resources for the RPCs.
    // Multiple Executors can be registered with a Server. The executor is responsible
    // for pulling incoming messages off the receive queue and executing the associated
    // context. By default, an executor only uses a single thread. A typical use case is
    // an Executor executing a context which immediately pushes the work to a thread pool.
    // However, for very low-latency messaging, you might want to use a multi-threaded
    // Executor and a Blocking Context - meaning the Context performs the entire RPC function
    // on the Executor's thread.
    LOG(INFO) << "Creating Executor";
    auto executor = server.RegisterExecutor(new Executor(1));

    // You can register RPC execution contexts from any registered RPC on any executor.
    // The power of that will become clear in later examples. For now, we will register
    // 10 instances of the simple::Inference::Compute RPC's SimpleContext execution context
    // with the Executor.
    LOG(INFO) << "Creating Execution Contexts for RPC (simple::Inference::Compute) with Executor";
    executor->RegisterContexts(rpcCompute, rpcResources, 10);

    LOG(INFO) << "Running Server";
    server.Run(std::chrono::milliseconds(2000), [] {
        // This is a timeout loop executed every 2 seconds.
        // Run() with no arguments will run an empty timeout loop every 5 seconds.
        // RunAsync() will return immediately; it's your responsibility to ensure the
        // server doesn't go out of scope, or a Shutdown will be triggered on your services.
    });
}


================================================
FILE: jupyter_notebook_config.py
================================================
# Configuration file for jupyter-notebook.

#------------------------------------------------------------------------------
# Application(SingletonConfigurable) configuration
#------------------------------------------------------------------------------

## This is an application.

## The date format used by logging formatters for %(asctime)s
#c.Application.log_datefmt = '%Y-%m-%d %H:%M:%S'

## The Logging format template
#c.Application.log_format = '[%(name)s]%(highlevel)s %(message)s'

## Set the log level by value or name.
#c.Application.log_level = 30

#------------------------------------------------------------------------------
# JupyterApp(Application) configuration
#------------------------------------------------------------------------------

## Base class for Jupyter applications

## Answer yes to any prompts.
#c.JupyterApp.answer_yes = False

## Full path of a config file.
#c.JupyterApp.config_file = ''

## Specify a config file to load.
#c.JupyterApp.config_file_name = ''

## Generate default config file.
#c.JupyterApp.generate_config = False

#------------------------------------------------------------------------------
# NotebookApp(JupyterApp) configuration
#------------------------------------------------------------------------------

## Set the Access-Control-Allow-Credentials: true header
#c.NotebookApp.allow_credentials = False

## Set the Access-Control-Allow-Origin header
#
#  Use '*' to allow any origin to access your server.
#
#  Takes precedence over allow_origin_pat.
#c.NotebookApp.allow_origin = ''

## Use a regular expression for the Access-Control-Allow-Origin header
#
#  Requests from an origin matching the expression will get replies with:
#
#      Access-Control-Allow-Origin: origin
#
#  where `origin` is the origin of the request.
#
#  Ignored if allow_origin is set.
#c.NotebookApp.allow_origin_pat = ''

## Allow password to be changed at login for the notebook server.
#
#  While logging in with a token, the notebook server UI will give the
#  opportunity to the user to enter a new password at the same time that will
#  replace the token login mechanism.
#
#  This can be set to false to prevent changing password from the UI/API.
#c.NotebookApp.allow_password_change = True

## Allow requests where the Host header doesn't point to a local server
#
#  By default, requests get a 403 forbidden response if the 'Host' header
#  shows that the browser thinks it's on a non-local domain. Setting this
#  option to True disables this check.
#
#  This protects against 'DNS rebinding' attacks, where a remote web server
#  serves you a page and then changes its DNS to send later requests to a
#  local IP, bypassing same-origin checks.
#
#  Local IP addresses (such as 127.0.0.1 and ::1) are allowed as local, along
#  with hostnames configured in local_hostnames.
c.NotebookApp.allow_remote_access = True

## Whether to allow the user to run the notebook as root.
c.NotebookApp.allow_root = True

## DEPRECATED use base_url
#c.NotebookApp.base_project_url = '/'

## The base URL for the notebook server.
#
#  Leading and trailing slashes can be omitted, and will automatically be
#  added.
#c.NotebookApp.base_url = '/'

## Specify what command to use to invoke a web browser when opening the
#  notebook. If not specified, the default browser will be determined by the
#  `webbrowser` standard library module, which allows setting of the BROWSER
#  environment variable to override it.
#c.NotebookApp.browser = ''

## The full path to an SSL/TLS certificate file.
#c.NotebookApp.certfile = ''

## The full path to a certificate authority certificate for SSL/TLS client
#  authentication.
#c.NotebookApp.client_ca = ''

## The config manager class to use
#c.NotebookApp.config_manager_class = 'notebook.services.config.manager.ConfigManager'

## The notebook manager class to use.
#c.NotebookApp.contents_manager_class = 'notebook.services.contents.largefilemanager.LargeFileManager'

## Extra keyword arguments to pass to `set_secure_cookie`. See tornado's
#  set_secure_cookie docs for details.
#c.NotebookApp.cookie_options = {}

## The random bytes used to secure cookies. By default this is a new random
#  number every time you start the Notebook. Set it to a value in a config
#  file to enable logins to persist across server sessions.
#
#  Note: Cookie secrets should be kept private, do not share config files with
#  cookie_secret stored in plaintext (you can read the value from a file).
#c.NotebookApp.cookie_secret = b''

## The file where the cookie secret is stored.
#c.NotebookApp.cookie_secret_file = ''

## Override URL shown to users.
#
#  Replace actual URL, including protocol, address, port and base URL, with
#  the given value when displaying URL to the users. Do not change the actual
#  connection URL. If authentication token is enabled, the token is added to
#  the custom URL automatically.
#
#  This option is intended to be used when the URL to display to the user
#  cannot be determined reliably by the Jupyter notebook server (proxified or
#  containerized setups for example).
#c.NotebookApp.custom_display_url = ''

## The default URL to redirect to from `/`
#c.NotebookApp.default_url = '/tree'

## Disable cross-site-request-forgery protection
#
#  Jupyter notebook 4.3.1 introduces protection from cross-site request
#  forgeries, requiring API requests to either:
#
#  - originate from pages served by this server (validated with XSRF cookie
#    and token), or
#  - authenticate with a token
#
#  Some anonymous compute resources still desire the ability to run code,
#  completely without authentication. These services can disable all
#  authentication and security checks, with the full knowledge of what that
#  implies.
#c.NotebookApp.disable_check_xsrf = False

## Whether to enable MathJax for typesetting math/TeX
#
#  MathJax is the javascript library Jupyter uses to render math/LaTeX. It is
#  very large, so you may want to disable it if you have a slow internet
#  connection, or for offline use of the notebook.
#
#  When disabled, equations etc. will appear as their untransformed TeX
#  source.
#c.NotebookApp.enable_mathjax = True

## extra paths to look for Javascript notebook extensions
#c.NotebookApp.extra_nbextensions_path = []

## handlers that should be loaded at higher priority than the default services
#c.NotebookApp.extra_services = []

## Extra paths to search for serving static files.
#
#  This allows adding javascript/css to be available from the notebook server
#  machine, or overriding individual files in the IPython
#c.NotebookApp.extra_static_paths = []

## Extra paths to search for serving jinja templates.
#
#  Can be used to override templates from notebook.templates.
#c.NotebookApp.extra_template_paths = []

##
#c.NotebookApp.file_to_run = ''

## Extra keyword arguments to pass to `get_secure_cookie`. See tornado's
#  get_secure_cookie docs for details.
#c.NotebookApp.get_secure_cookie_kwargs = {}

## Deprecated: Use minified JS file or not, mainly use during dev to avoid JS
#  recompilation
#c.NotebookApp.ignore_minified_js = False

## (bytes/sec) Maximum rate at which stream output can be sent on iopub before
#  they are limited.
#c.NotebookApp.iopub_data_rate_limit = 1000000

## (msgs/sec) Maximum rate at which messages can be sent on iopub before they
#  are limited.
#c.NotebookApp.iopub_msg_rate_limit = 1000

## The IP address the notebook server will listen on.
c.NotebookApp.ip = '0.0.0.0'

## Supply extra arguments that will be passed to Jinja environment.
#c.NotebookApp.jinja_environment_options = {}

## Extra variables to supply to jinja templates when rendering.
#c.NotebookApp.jinja_template_vars = {}

## The kernel manager class to use.
#c.NotebookApp.kernel_manager_class = 'notebook.services.kernels.kernelmanager.MappingKernelManager'

## The kernel spec manager class to use. Should be a subclass of
#  `jupyter_client.kernelspec.KernelSpecManager`.
#
#  The Api of KernelSpecManager is provisional and might change without
#  warning between this version of Jupyter and the next stable one.
#c.NotebookApp.kernel_spec_manager_class = 'jupyter_client.kernelspec.KernelSpecManager'

## The full path to a private key file for usage with SSL/TLS.
#c.NotebookApp.keyfile = ''

## Hostnames to allow as local when allow_remote_access is False.
#
#  Local IP addresses (such as 127.0.0.1 and ::1) are automatically accepted
#  as local as well.
#c.NotebookApp.local_hostnames = ['localhost']

## The login handler class to use.
#c.NotebookApp.login_handler_class = 'notebook.auth.login.LoginHandler'

## The logout handler class to use.
#c.NotebookApp.logout_handler_class = 'notebook.auth.logout.LogoutHandler'

## The MathJax.js configuration file that is to be used.
#c.NotebookApp.mathjax_config = 'TeX-AMS-MML_HTMLorMML-full,Safe'

## A custom url for MathJax.js. Should be in the form of a case-sensitive url
#  to MathJax, for example: /static/components/MathJax/MathJax.js
#c.NotebookApp.mathjax_url = ''

## Sets the maximum allowed size of the client request body, specified in the
#  Content-Length request header field. If the size in a request exceeds the
#  configured value, a malformed HTTP message is returned to the client.
#
#  Note: max_body_size is applied even in streaming mode.
#c.NotebookApp.max_body_size = 536870912

## Gets or sets the maximum amount of memory, in bytes, that is allocated for
#  use by the buffer manager.
#c.NotebookApp.max_buffer_size = 536870912

## Dict of Python modules to load as notebook server extensions. Entry values
#  can be used to enable and disable the loading of the extensions. The
#  extensions will be loaded in alphabetical order.
#c.NotebookApp.nbserver_extensions = {}

## The directory to use for notebooks and kernels.
c.NotebookApp.notebook_dir = '/work'

## Whether to open in a browser after starting. The specific browser used is
#  platform dependent and determined by the python standard library
#  `webbrowser` module, unless it is overridden using the --browser
#  (NotebookApp.browser) configuration option.
#c.NotebookApp.open_browser = True

## Hashed password to use for web authentication.
#
#  To generate, type in a python/IPython shell:
#
#    from notebook.auth import passwd; passwd()
#
#  The string should be of the form type:salt:hashed-password.
#c.NotebookApp.password = ''

## Forces users to use a password for the Notebook server. This is useful in
#  a multi user environment, for instance when everybody in the LAN can
#  access each other's machine through ssh.
#
#  In such a case, serving the notebook server on localhost is not secure
#  since any user can connect to the notebook server via ssh.
#c.NotebookApp.password_required = False

## The port the notebook server will listen on.
#c.NotebookApp.port = 8888

## The number of additional ports to try if the specified port is not
#  available.
#c.NotebookApp.port_retries = 50

## DISABLED: use %pylab or %matplotlib in the notebook to enable matplotlib.
#c.NotebookApp.pylab = 'disabled'

## If True, display a button in the dashboard to quit (shutdown the notebook
#  server).
#c.NotebookApp.quit_button = True

## (sec) Time window used to check the message and data rate limits.
#c.NotebookApp.rate_limit_window = 3

## Reraise exceptions encountered loading server extensions?
#c.NotebookApp.reraise_server_extension_failures = False

## DEPRECATED use the nbserver_extensions dict instead
#c.NotebookApp.server_extensions = []

## The session manager class to use.
#c.NotebookApp.session_manager_class = 'notebook.services.sessions.sessionmanager.SessionManager'

## Shut down the server after N seconds with no kernels or terminals running
#  and no activity. This can be used together with culling idle kernels
#  (MappingKernelManager.cull_idle_timeout) to shutdown the notebook server
#  when it's not in use. This is not precisely timed: it may shut down up to
#  a minute later. 0 (the default) disables this automatic shutdown.
#c.NotebookApp.shutdown_no_activity_timeout = 0

## Supply SSL options for the tornado HTTPServer. See the tornado docs for
#  details.
#c.NotebookApp.ssl_options = {}

## Supply overrides for terminado.
#  Currently only supports "shell_command".
#c.NotebookApp.terminado_settings = {}

## Set to False to disable terminals.
#
#  This does *not* make the notebook server more secure by itself. Anything
#  the user can do in a terminal, they can also do in a notebook.
#
#  Terminals may also be automatically disabled if the terminado package is
#  not available.
#c.NotebookApp.terminals_enabled = True

## Token used for authenticating first-time connections to the server.
#
#  When no password is enabled, the default is to generate a new, random
#  token.
#
#  Setting to an empty string disables authentication altogether, which is
#  NOT RECOMMENDED.
#c.NotebookApp.token = ''

## Supply overrides for the tornado.web.Application that the Jupyter notebook
#  uses.
#c.NotebookApp.tornado_settings = {}

## Whether to trust or not X-Scheme/X-Forwarded-Proto and
#  X-Real-Ip/X-Forwarded-For headers sent by the upstream reverse proxy.
#  Necessary if the proxy handles SSL
#c.NotebookApp.trust_xheaders = False

## DEPRECATED, use tornado_settings
#c.NotebookApp.webapp_settings = {}

## Specify where to open the notebook on startup. This is the `new` argument
#  passed to the standard library method `webbrowser.open`. The behaviour is
#  not guaranteed, but depends on browser support. Valid values are:
#
#  - 2 opens a new tab,
#  - 1 opens a new window,
#  - 0 opens in an existing window.
#
#  See the `webbrowser.open` documentation for details.
#c.NotebookApp.webbrowser_open_new = 2

## Set the tornado compression options for websocket connections.
#
#  This value will be returned from
#  :meth:`WebSocketHandler.get_compression_options`. None (default) will
#  disable compression. A dict (even an empty one) will enable compression.
#
#  See the tornado docs for WebSocketHandler.get_compression_options for
#  details.
#c.NotebookApp.websocket_compression_options = None

## The base URL for websockets, if it differs from the HTTP server (hint: it
#  almost certainly doesn't).
#
#  Should be in the form of an HTTP origin: ws[s]://hostname[:port]
#c.NotebookApp.websocket_url = ''

#------------------------------------------------------------------------------
# ConnectionFileMixin(LoggingConfigurable) configuration
#------------------------------------------------------------------------------

## Mixin for configurable classes that work with connection files

## JSON file in which to store connection info [default: kernel-<id>.json]
#
#  This file will contain the IP, ports, and authentication key needed to
#  connect clients to this kernel. By default, this file will be created in
#  the security dir of the current profile, but can be specified by absolute
#  path.
#c.ConnectionFileMixin.connection_file = ''

## set the control (ROUTER) port [default: random]
#c.ConnectionFileMixin.control_port = 0

## set the heartbeat port [default: random]
#c.ConnectionFileMixin.hb_port = 0

## set the iopub (PUB) port [default: random]
#c.ConnectionFileMixin.iopub_port = 0

## Set the kernel's IP address [default localhost]. If the IP address is
#  something other than localhost, then Consoles on other machines will be
#  able to connect to the Kernel, so be careful!
#c.ConnectionFileMixin.ip = ''

## set the shell (ROUTER) port [default: random]
#c.ConnectionFileMixin.shell_port = 0

## set the stdin (ROUTER) port [default: random]
#c.ConnectionFileMixin.stdin_port = 0

##
#c.ConnectionFileMixin.transport = 'tcp'

#------------------------------------------------------------------------------
# KernelManager(ConnectionFileMixin) configuration
#------------------------------------------------------------------------------

## Manages a single kernel in a subprocess on this host.
#
#  This version starts kernels with Popen.

## Should we autorestart the kernel if it dies.
#c.KernelManager.autorestart = True

## DEPRECATED: Use kernel_name instead.
#
#  The Popen Command to launch the kernel. Override this if you have a custom
#  kernel. If kernel_cmd is specified in a configuration file, Jupyter does
#  not pass any arguments to the kernel, because it cannot make any
#  assumptions about the arguments that the kernel understands. In
#  particular, this means that the kernel does not receive the option --debug
#  if it is given on the Jupyter command line.
#c.KernelManager.kernel_cmd = []

## Time to wait for a kernel to terminate before killing it, in seconds.
#c.KernelManager.shutdown_wait_time = 5.0

#------------------------------------------------------------------------------
# Session(Configurable) configuration
#------------------------------------------------------------------------------

## Object for handling serialization and sending of messages.
#
#  The Session object handles building messages and sending them with ZMQ
#  sockets or ZMQStream objects. Objects can communicate with each other over
#  the network via Session objects, and only need to work with the dict-based
#  IPython message spec. The Session will handle
#  serialization/deserialization, security, and metadata.
#
#  Sessions support configurable serialization via packer/unpacker traits,
#  and signing with HMAC digests via the key/keyfile traits.
#
#  Parameters
#  ----------
#
#  debug : bool
#      whether to trigger extra debugging statements
#  packer/unpacker : str : 'json', 'pickle' or import_string
#      importstrings for methods to serialize message parts. If just 'json'
#      or 'pickle', predefined JSON and pickle packers will be used.
#      Otherwise, the entire importstring must be used.
#
#      The functions must accept at least valid JSON input, and output
#      *bytes*.
#
#      For example, to use msgpack:
#      packer = 'msgpack.packb', unpacker='msgpack.unpackb'
#  pack/unpack : callables
#      You can also set the pack/unpack callables for serialization directly.
#  session : bytes
#      the ID of this Session object. The default is to generate a new UUID.
#  username : unicode
#      username added to message headers. The default is to ask the OS.
#  key : bytes
#      The key used to initialize an HMAC signature. If unset, messages will
#      not be signed or checked.
#  keyfile : filepath
#      The file containing a key. If this is set, `key` will be initialized
#      to the contents of the file.

## Threshold (in bytes) beyond which an object's buffer should be extracted
#  to avoid pickling.
#c.Session.buffer_threshold = 1024

## Whether to check PID to protect against calls after fork.
#
#  This check can be disabled if fork-safety is handled elsewhere.
#c.Session.check_pid = True

## Threshold (in bytes) beyond which a buffer should be sent without copying.
#c.Session.copy_threshold = 65536

## Debug output in the Session
#c.Session.debug = False

## The maximum number of digests to remember.
#
#  The digest history will be culled when it exceeds this value.
#c.Session.digest_history_size = 65536

## The maximum number of items for a container to be introspected for custom
#  serialization. Containers larger than this are pickled outright.
#c.Session.item_threshold = 64

## execution key, for signing messages.
#c.Session.key = b''

## path to file containing execution key.
#c.Session.keyfile = ''

## Metadata dictionary, which serves as the default top-level metadata dict
#  for each message.
#c.Session.metadata = {}

## The name of the packer for serializing messages. Should be one of 'json',
#  'pickle', or an import name for a custom callable serializer.
#c.Session.packer = 'json'

## The UUID identifying this session.
#c.Session.session = ''

## The digest scheme used to construct the message signatures. Must have the
#  form 'hmac-HASH'.
#c.Session.signature_scheme = 'hmac-sha256'

## The name of the unpacker for unserializing messages. Only used with custom
#  functions for `packer`.
#c.Session.unpacker = 'json'

## Username for the Session. Default is your system username.
#c.Session.username = 'username'

#------------------------------------------------------------------------------
# MultiKernelManager(LoggingConfigurable) configuration
#------------------------------------------------------------------------------

## A class for managing multiple kernels.

## The name of the default kernel to start
#c.MultiKernelManager.default_kernel_name = 'python3'

## The kernel manager class. This is configurable to allow subclassing of the
#  KernelManager for customized behavior.
#c.MultiKernelManager.kernel_manager_class = 'jupyter_client.ioloop.IOLoopKernelManager'

#------------------------------------------------------------------------------
# MappingKernelManager(MultiKernelManager) configuration
#------------------------------------------------------------------------------

## A KernelManager that handles notebook mapping and HTTP error handling

## Whether messages from kernels whose frontends have disconnected should be
#  buffered in-memory.
#
#  When True (default), messages are buffered and replayed on reconnect,
#  avoiding lost messages due to interrupted connectivity.
#
#  Disable if long-running kernels will produce too much output while no
#  frontends are connected.
#c.MappingKernelManager.buffer_offline_messages = True

## Whether to consider culling kernels which are busy. Only effective if
#  cull_idle_timeout > 0.
#c.MappingKernelManager.cull_busy = False

## Whether to consider culling kernels which have one or more connections.
#  Only effective if cull_idle_timeout > 0.
#c.MappingKernelManager.cull_connected = False

## Timeout (in seconds) after which a kernel is considered idle and ready to
#  be culled. Values of 0 or lower disable culling. Very short timeouts may
#  result in kernels being culled for users with poor network connections.
#c.MappingKernelManager.cull_idle_timeout = 0

## The interval (in seconds) on which to check for idle kernels exceeding the
#  cull timeout value.
#c.MappingKernelManager.cull_interval = 300

## Timeout for giving up on a kernel (in seconds).
#
#  On starting and restarting kernels, we check whether the kernel is running
#  and responsive by sending kernel_info_requests. This sets the timeout in
#  seconds for how long the kernel can take before being presumed dead. This
#  affects the MappingKernelManager (which handles kernel restarts) and the
#  ZMQChannelsHandler (which handles the startup).
#c.MappingKernelManager.kernel_info_timeout = 60

##
#c.MappingKernelManager.root_dir = ''

#------------------------------------------------------------------------------
# ContentsManager(LoggingConfigurable) configuration
#------------------------------------------------------------------------------

## Base class for serving files and directories.
#
#  This serves any text or binary file, as well as directories, with special
#  handling for JSON notebook documents.
#
#  Most APIs take a path argument, which is always an API-style unicode path,
#  and always refers to a directory.
#
#  - unicode, not url-escaped
#  - '/'-separated
#  - leading and trailing '/' will be stripped
#  - if unspecified, path defaults to '', indicating the root path.

## Allow access to hidden files
#c.ContentsManager.allow_hidden = False

##
#c.ContentsManager.checkpoints = None

##
#c.ContentsManager.checkpoints_class = 'notebook.services.contents.checkpoints.Checkpoints'

##
#c.ContentsManager.checkpoints_kwargs = {}

## handler class to use when serving raw file requests.
#
#  Default is a fallback that talks to the ContentsManager API, which may be
#  inefficient, especially for large files.
#
#  Local files-based ContentsManagers can use a StaticFileHandler subclass,
#  which will be much more efficient.
#
#  Access to these files should be Authenticated.
#c.ContentsManager.files_handler_class = 'notebook.files.handlers.FilesHandler'

## Extra parameters to pass to files_handler_class.
#
#  For example, StaticFileHandlers generally expect a `path` argument
#  specifying the root directory from which to serve files.
#c.ContentsManager.files_handler_params = {}

## Glob patterns to hide in file and directory listings.
#c.ContentsManager.hide_globs = ['__pycache__', '*.pyc', '*.pyo', '.DS_Store', '*.so', '*.dylib', '*~']

## Python callable or importstring thereof
#
#  To be called on a contents model prior to save.
#
#  This can be used to process the structure, such as removing notebook
#  outputs or other side effects that should not be saved.
#
#  It will be called as (all arguments passed by keyword)::
#
#      hook(path=path, model=model, contents_manager=self)
#
#  - model: the model to be saved. Includes file contents. Modifying this
#    dict will affect the file that is stored.
#  - path: the API path of the save destination
#  - contents_manager: this ContentsManager instance
#c.ContentsManager.pre_save_hook = None

##
#c.ContentsManager.root_dir = '/'

## The base name used when creating untitled directories.
#c.ContentsManager.untitled_directory = 'Untitled Folder'

## The base name used when creating untitled files.
#c.ContentsManager.untitled_file = 'untitled'

## The base name used when creating untitled notebooks.
#c.ContentsManager.untitled_notebook = 'Untitled'

#------------------------------------------------------------------------------
# FileManagerMixin(Configurable) configuration
#------------------------------------------------------------------------------

## Mixin for ContentsAPI classes that interact with the filesystem.
#
#  Provides facilities for reading, writing, and copying both notebooks and
#  generic files.
#
#  Shared by FileContentsManager and FileCheckpoints.
#
#  Note
#  ----
#  Classes using this mixin must provide the following attributes:
#
#  root_dir : unicode
#      A directory against which API-style paths are to be resolved.
#
#  log : logging.Logger

## By default notebooks are saved on disk in a temporary file and then, if
#  successfully written, it replaces the old one.
#  This procedure, namely 'atomic_writing', causes some bugs on file systems
#  without operation order enforcement (like some networked fs). If set to
#  False, the new notebook is written directly over the old one, which could
#  fail (e.g. full filesystem or quota).
#c.FileManagerMixin.use_atomic_writing = True

#------------------------------------------------------------------------------
# FileContentsManager(FileManagerMixin,ContentsManager) configuration
#------------------------------------------------------------------------------

## If True (default), deleting files will send them to the platform's
#  trash/recycle bin, where they can be recovered. If False, deleting files
#  really deletes them.
#c.FileContentsManager.delete_to_trash = True

## Python callable or importstring thereof
#
#  to be called on the path of a file just saved.
#
#  This can be used to process the file on disk, such as converting the
#  notebook to a script or HTML via nbconvert.
#
#  It will be called as (all arguments passed by keyword)::
#
#      hook(os_path=os_path, model=model, contents_manager=instance)
#
#  - path: the filesystem path to the file just written
#  - model: the model representing the file
#  - contents_manager: this ContentsManager instance
#c.FileContentsManager.post_save_hook = None

##
#c.FileContentsManager.root_dir = ''

## DEPRECATED, use post_save_hook. Will be removed in Notebook 5.0
#c.FileContentsManager.save_script = False

#------------------------------------------------------------------------------
# NotebookNotary(LoggingConfigurable) configuration
#------------------------------------------------------------------------------

## A class for computing and verifying notebook signatures.

## The hashing algorithm used to sign notebooks.
#c.NotebookNotary.algorithm = 'sha256'

## The sqlite file in which to store notebook signatures. By default, this
#  will be in your Jupyter data directory. You can set it to ':memory:' to
#  disable sqlite writing to the filesystem.
#c.NotebookNotary.db_file = ''

## The secret key with which notebooks are signed.
#c.NotebookNotary.secret = b''

## The file where the secret key is stored.
#c.NotebookNotary.secret_file = ''

## A callable returning the storage backend for notebook signatures. The
#  default uses an SQLite database.
#c.NotebookNotary.store_factory = traitlets.Undefined

#------------------------------------------------------------------------------
# KernelSpecManager(LoggingConfigurable) configuration
#------------------------------------------------------------------------------

## If there is no Python kernelspec registered and the IPython kernel is
#  available, ensure it is added to the spec list.
#c.KernelSpecManager.ensure_native_kernel = True

## The kernel spec class. This is configurable to allow subclassing of the
#  KernelSpecManager for customized behavior.
#c.KernelSpecManager.kernel_spec_class = 'jupyter_client.kernelspec.KernelSpec'

## Whitelist of allowed kernel names.
#
#  By default, all installed kernels are allowed.
#c.KernelSpecManager.whitelist = set()

================================================
FILE: models/README.md
================================================
## Sample Models

Included in this folder are a collection of open source models and some
scripts to build TensorRT engines from these models.

Currently, the samples provided only generate TensorRT engines with random
weights and are only good for synthetic tests. A sketch of the general build
flow follows.

TODO: update scripts to pull open sourced weights for fully functional models.
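For orientation, here is a hedged sketch of the general build flow: creating an
engine from one of these Caffe deploy files with TensorRT's legacy Caffe parser
(the C++ API vintage this repo targets). This is not one of the repo's scripts;
the file paths, the caffemodel name, and the "prob" output blob are
assumptions, and the scripts here use random weights instead of a caffemodel.

```cpp
// Sketch only: build a TensorRT engine from a Caffe deploy prototxt.
#include <cstddef>
#include <iostream>

#include "NvCaffeParser.h"
#include "NvInfer.h"

// Minimal logger required by the TensorRT builder.
class Logger : public nvinfer1::ILogger
{
    void log(Severity severity, const char* msg) override
    {
        if(severity <= Severity::kWARNING) std::cout << msg << std::endl;
    }
} gLogger;

int main()
{
    auto builder = nvinfer1::createInferBuilder(gLogger);
    auto network = builder->createNetwork();
    auto parser = nvcaffeparser1::createCaffeParser();

    // Parse the deploy file; weights come from a caffemodel (hypothetical path).
    auto blobs = parser->parse("ResNet-50-deploy.prototxt", "ResNet-50.caffemodel",
                               *network, nvinfer1::DataType::kFLOAT);
    network->markOutput(*blobs->find("prob")); // "prob" assumed as the softmax output

    builder->setMaxBatchSize(8);
    builder->setMaxWorkspaceSize(std::size_t(1) << 30); // 1 GiB scratch for tactics

    auto engine = builder->buildCudaEngine(*network);
    if(!engine)
    {
        std::cerr << "engine build failed" << std::endl;
        return 1;
    }

    // ... serialize with engine->serialize() or create an execution context ...
    engine->destroy();
    parser->destroy();
    network->destroy();
    builder->destroy();
    return 0;
}
```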
## Credits

Caffe ResNet-50 and ResNet-152 models from
[KaimingHe/deep-residual-networks](https://github.com/KaimingHe/deep-residual-networks)
are included without modification.

> The MIT License (MIT)
>
> Copyright (c) 2016 Shaoqing Ren
>
> Permission is hereby granted, free of charge, to any person obtaining a copy
> of this software and associated documentation files (the "Software"), to deal
> in the Software without restriction, including without limitation the rights
> to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> copies of the Software, and to permit persons to whom the Software is
> furnished to do so, subject to the following conditions:
>
> The above copyright notice and this permission notice shall be included in all
> copies or substantial portions of the Software.
>
> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
> AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> SOFTWARE.

================================================
FILE: models/ResNet-152-deploy.prototxt
================================================
name: "ResNet-152" input: "data" input_dim: 1 input_dim: 3 input_dim: 224 input_dim: 224 layer { bottom: "data" top: "conv1" name: "conv1" type: "Convolution" convolution_param { num_output: 64 kernel_size: 7 pad: 3 stride: 2 bias_term: false } } layer { bottom: "conv1" top: "conv1" name: "bn_conv1" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "conv1" top: "conv1" name: "scale_conv1" type: "Scale" scale_param { bias_term: true } } layer { top: "conv1" bottom: "conv1" name: "conv1_relu" type: "ReLU" } layer { bottom: "conv1" top: "pool1" name: "pool1" type: "Pooling" pooling_param { kernel_size: 3 stride: 2 pool: MAX } } layer { bottom: "pool1" top: "res2a_branch1" name: "res2a_branch1" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res2a_branch1" top: "res2a_branch1" name: "bn2a_branch1" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res2a_branch1" top: "res2a_branch1" name: "scale2a_branch1" type: "Scale" scale_param { bias_term: true } } layer { bottom: "pool1" top: "res2a_branch2a" name: "res2a_branch2a" type: "Convolution" convolution_param { num_output: 64 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res2a_branch2a" top: "res2a_branch2a" name: "bn2a_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res2a_branch2a" top: "res2a_branch2a" name: "scale2a_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res2a_branch2a" bottom: "res2a_branch2a" name: "res2a_branch2a_relu" type: "ReLU" } layer { bottom: "res2a_branch2a" top: "res2a_branch2b" name: "res2a_branch2b" type: "Convolution" convolution_param { num_output: 64 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res2a_branch2b" top: "res2a_branch2b" name: "bn2a_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res2a_branch2b" top: "res2a_branch2b" name: "scale2a_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top:
"res2a_branch2b" bottom: "res2a_branch2b" name: "res2a_branch2b_relu" type: "ReLU" } layer { bottom: "res2a_branch2b" top: "res2a_branch2c" name: "res2a_branch2c" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res2a_branch2c" top: "res2a_branch2c" name: "bn2a_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res2a_branch2c" top: "res2a_branch2c" name: "scale2a_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res2a_branch1" bottom: "res2a_branch2c" top: "res2a" name: "res2a" type: "Eltwise" } layer { bottom: "res2a" top: "res2a" name: "res2a_relu" type: "ReLU" } layer { bottom: "res2a" top: "res2b_branch2a" name: "res2b_branch2a" type: "Convolution" convolution_param { num_output: 64 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res2b_branch2a" top: "res2b_branch2a" name: "bn2b_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res2b_branch2a" top: "res2b_branch2a" name: "scale2b_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res2b_branch2a" bottom: "res2b_branch2a" name: "res2b_branch2a_relu" type: "ReLU" } layer { bottom: "res2b_branch2a" top: "res2b_branch2b" name: "res2b_branch2b" type: "Convolution" convolution_param { num_output: 64 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res2b_branch2b" top: "res2b_branch2b" name: "bn2b_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res2b_branch2b" top: "res2b_branch2b" name: "scale2b_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res2b_branch2b" bottom: "res2b_branch2b" name: "res2b_branch2b_relu" type: "ReLU" } layer { bottom: "res2b_branch2b" top: "res2b_branch2c" name: "res2b_branch2c" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res2b_branch2c" top: "res2b_branch2c" name: "bn2b_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res2b_branch2c" top: "res2b_branch2c" name: "scale2b_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res2a" bottom: "res2b_branch2c" top: "res2b" name: "res2b" type: "Eltwise" } layer { bottom: "res2b" top: "res2b" name: "res2b_relu" type: "ReLU" } layer { bottom: "res2b" top: "res2c_branch2a" name: "res2c_branch2a" type: "Convolution" convolution_param { num_output: 64 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res2c_branch2a" top: "res2c_branch2a" name: "bn2c_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res2c_branch2a" top: "res2c_branch2a" name: "scale2c_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res2c_branch2a" bottom: "res2c_branch2a" name: "res2c_branch2a_relu" type: "ReLU" } layer { bottom: "res2c_branch2a" top: "res2c_branch2b" name: "res2c_branch2b" type: "Convolution" convolution_param { num_output: 64 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res2c_branch2b" top: "res2c_branch2b" name: "bn2c_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res2c_branch2b" top: "res2c_branch2b" name: "scale2c_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res2c_branch2b" bottom: "res2c_branch2b" name: "res2c_branch2b_relu" type: "ReLU" } layer { bottom: 
"res2c_branch2b" top: "res2c_branch2c" name: "res2c_branch2c" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res2c_branch2c" top: "res2c_branch2c" name: "bn2c_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res2c_branch2c" top: "res2c_branch2c" name: "scale2c_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res2b" bottom: "res2c_branch2c" top: "res2c" name: "res2c" type: "Eltwise" } layer { bottom: "res2c" top: "res2c" name: "res2c_relu" type: "ReLU" } layer { bottom: "res2c" top: "res3a_branch1" name: "res3a_branch1" type: "Convolution" convolution_param { num_output: 512 kernel_size: 1 pad: 0 stride: 2 bias_term: false } } layer { bottom: "res3a_branch1" top: "res3a_branch1" name: "bn3a_branch1" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3a_branch1" top: "res3a_branch1" name: "scale3a_branch1" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res2c" top: "res3a_branch2a" name: "res3a_branch2a" type: "Convolution" convolution_param { num_output: 128 kernel_size: 1 pad: 0 stride: 2 bias_term: false } } layer { bottom: "res3a_branch2a" top: "res3a_branch2a" name: "bn3a_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3a_branch2a" top: "res3a_branch2a" name: "scale3a_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res3a_branch2a" bottom: "res3a_branch2a" name: "res3a_branch2a_relu" type: "ReLU" } layer { bottom: "res3a_branch2a" top: "res3a_branch2b" name: "res3a_branch2b" type: "Convolution" convolution_param { num_output: 128 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res3a_branch2b" top: "res3a_branch2b" name: "bn3a_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3a_branch2b" top: "res3a_branch2b" name: "scale3a_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res3a_branch2b" bottom: "res3a_branch2b" name: "res3a_branch2b_relu" type: "ReLU" } layer { bottom: "res3a_branch2b" top: "res3a_branch2c" name: "res3a_branch2c" type: "Convolution" convolution_param { num_output: 512 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res3a_branch2c" top: "res3a_branch2c" name: "bn3a_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3a_branch2c" top: "res3a_branch2c" name: "scale3a_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res3a_branch1" bottom: "res3a_branch2c" top: "res3a" name: "res3a" type: "Eltwise" } layer { bottom: "res3a" top: "res3a" name: "res3a_relu" type: "ReLU" } layer { bottom: "res3a" top: "res3b1_branch2a" name: "res3b1_branch2a" type: "Convolution" convolution_param { num_output: 128 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res3b1_branch2a" top: "res3b1_branch2a" name: "bn3b1_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3b1_branch2a" top: "res3b1_branch2a" name: "scale3b1_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res3b1_branch2a" bottom: "res3b1_branch2a" name: "res3b1_branch2a_relu" type: "ReLU" } layer { bottom: "res3b1_branch2a" top: "res3b1_branch2b" name: "res3b1_branch2b" type: "Convolution" convolution_param { num_output: 128 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res3b1_branch2b" 
top: "res3b1_branch2b" name: "bn3b1_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3b1_branch2b" top: "res3b1_branch2b" name: "scale3b1_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res3b1_branch2b" bottom: "res3b1_branch2b" name: "res3b1_branch2b_relu" type: "ReLU" } layer { bottom: "res3b1_branch2b" top: "res3b1_branch2c" name: "res3b1_branch2c" type: "Convolution" convolution_param { num_output: 512 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res3b1_branch2c" top: "res3b1_branch2c" name: "bn3b1_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3b1_branch2c" top: "res3b1_branch2c" name: "scale3b1_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res3a" bottom: "res3b1_branch2c" top: "res3b1" name: "res3b1" type: "Eltwise" } layer { bottom: "res3b1" top: "res3b1" name: "res3b1_relu" type: "ReLU" } layer { bottom: "res3b1" top: "res3b2_branch2a" name: "res3b2_branch2a" type: "Convolution" convolution_param { num_output: 128 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res3b2_branch2a" top: "res3b2_branch2a" name: "bn3b2_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3b2_branch2a" top: "res3b2_branch2a" name: "scale3b2_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res3b2_branch2a" bottom: "res3b2_branch2a" name: "res3b2_branch2a_relu" type: "ReLU" } layer { bottom: "res3b2_branch2a" top: "res3b2_branch2b" name: "res3b2_branch2b" type: "Convolution" convolution_param { num_output: 128 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res3b2_branch2b" top: "res3b2_branch2b" name: "bn3b2_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3b2_branch2b" top: "res3b2_branch2b" name: "scale3b2_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res3b2_branch2b" bottom: "res3b2_branch2b" name: "res3b2_branch2b_relu" type: "ReLU" } layer { bottom: "res3b2_branch2b" top: "res3b2_branch2c" name: "res3b2_branch2c" type: "Convolution" convolution_param { num_output: 512 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res3b2_branch2c" top: "res3b2_branch2c" name: "bn3b2_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3b2_branch2c" top: "res3b2_branch2c" name: "scale3b2_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res3b1" bottom: "res3b2_branch2c" top: "res3b2" name: "res3b2" type: "Eltwise" } layer { bottom: "res3b2" top: "res3b2" name: "res3b2_relu" type: "ReLU" } layer { bottom: "res3b2" top: "res3b3_branch2a" name: "res3b3_branch2a" type: "Convolution" convolution_param { num_output: 128 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res3b3_branch2a" top: "res3b3_branch2a" name: "bn3b3_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3b3_branch2a" top: "res3b3_branch2a" name: "scale3b3_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res3b3_branch2a" bottom: "res3b3_branch2a" name: "res3b3_branch2a_relu" type: "ReLU" } layer { bottom: "res3b3_branch2a" top: "res3b3_branch2b" name: "res3b3_branch2b" type: "Convolution" convolution_param { num_output: 128 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res3b3_branch2b" top: "res3b3_branch2b" name: 
"bn3b3_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3b3_branch2b" top: "res3b3_branch2b" name: "scale3b3_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res3b3_branch2b" bottom: "res3b3_branch2b" name: "res3b3_branch2b_relu" type: "ReLU" } layer { bottom: "res3b3_branch2b" top: "res3b3_branch2c" name: "res3b3_branch2c" type: "Convolution" convolution_param { num_output: 512 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res3b3_branch2c" top: "res3b3_branch2c" name: "bn3b3_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3b3_branch2c" top: "res3b3_branch2c" name: "scale3b3_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res3b2" bottom: "res3b3_branch2c" top: "res3b3" name: "res3b3" type: "Eltwise" } layer { bottom: "res3b3" top: "res3b3" name: "res3b3_relu" type: "ReLU" } layer { bottom: "res3b3" top: "res3b4_branch2a" name: "res3b4_branch2a" type: "Convolution" convolution_param { num_output: 128 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res3b4_branch2a" top: "res3b4_branch2a" name: "bn3b4_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3b4_branch2a" top: "res3b4_branch2a" name: "scale3b4_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res3b4_branch2a" bottom: "res3b4_branch2a" name: "res3b4_branch2a_relu" type: "ReLU" } layer { bottom: "res3b4_branch2a" top: "res3b4_branch2b" name: "res3b4_branch2b" type: "Convolution" convolution_param { num_output: 128 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res3b4_branch2b" top: "res3b4_branch2b" name: "bn3b4_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3b4_branch2b" top: "res3b4_branch2b" name: "scale3b4_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res3b4_branch2b" bottom: "res3b4_branch2b" name: "res3b4_branch2b_relu" type: "ReLU" } layer { bottom: "res3b4_branch2b" top: "res3b4_branch2c" name: "res3b4_branch2c" type: "Convolution" convolution_param { num_output: 512 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res3b4_branch2c" top: "res3b4_branch2c" name: "bn3b4_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3b4_branch2c" top: "res3b4_branch2c" name: "scale3b4_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res3b3" bottom: "res3b4_branch2c" top: "res3b4" name: "res3b4" type: "Eltwise" } layer { bottom: "res3b4" top: "res3b4" name: "res3b4_relu" type: "ReLU" } layer { bottom: "res3b4" top: "res3b5_branch2a" name: "res3b5_branch2a" type: "Convolution" convolution_param { num_output: 128 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res3b5_branch2a" top: "res3b5_branch2a" name: "bn3b5_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3b5_branch2a" top: "res3b5_branch2a" name: "scale3b5_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res3b5_branch2a" bottom: "res3b5_branch2a" name: "res3b5_branch2a_relu" type: "ReLU" } layer { bottom: "res3b5_branch2a" top: "res3b5_branch2b" name: "res3b5_branch2b" type: "Convolution" convolution_param { num_output: 128 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res3b5_branch2b" top: "res3b5_branch2b" name: "bn3b5_branch2b" type: "BatchNorm" 
batch_norm_param { use_global_stats: true } } layer { bottom: "res3b5_branch2b" top: "res3b5_branch2b" name: "scale3b5_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res3b5_branch2b" bottom: "res3b5_branch2b" name: "res3b5_branch2b_relu" type: "ReLU" } layer { bottom: "res3b5_branch2b" top: "res3b5_branch2c" name: "res3b5_branch2c" type: "Convolution" convolution_param { num_output: 512 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res3b5_branch2c" top: "res3b5_branch2c" name: "bn3b5_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3b5_branch2c" top: "res3b5_branch2c" name: "scale3b5_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res3b4" bottom: "res3b5_branch2c" top: "res3b5" name: "res3b5" type: "Eltwise" } layer { bottom: "res3b5" top: "res3b5" name: "res3b5_relu" type: "ReLU" } layer { bottom: "res3b5" top: "res3b6_branch2a" name: "res3b6_branch2a" type: "Convolution" convolution_param { num_output: 128 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res3b6_branch2a" top: "res3b6_branch2a" name: "bn3b6_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3b6_branch2a" top: "res3b6_branch2a" name: "scale3b6_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res3b6_branch2a" bottom: "res3b6_branch2a" name: "res3b6_branch2a_relu" type: "ReLU" } layer { bottom: "res3b6_branch2a" top: "res3b6_branch2b" name: "res3b6_branch2b" type: "Convolution" convolution_param { num_output: 128 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res3b6_branch2b" top: "res3b6_branch2b" name: "bn3b6_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3b6_branch2b" top: "res3b6_branch2b" name: "scale3b6_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res3b6_branch2b" bottom: "res3b6_branch2b" name: "res3b6_branch2b_relu" type: "ReLU" } layer { bottom: "res3b6_branch2b" top: "res3b6_branch2c" name: "res3b6_branch2c" type: "Convolution" convolution_param { num_output: 512 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res3b6_branch2c" top: "res3b6_branch2c" name: "bn3b6_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3b6_branch2c" top: "res3b6_branch2c" name: "scale3b6_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res3b5" bottom: "res3b6_branch2c" top: "res3b6" name: "res3b6" type: "Eltwise" } layer { bottom: "res3b6" top: "res3b6" name: "res3b6_relu" type: "ReLU" } layer { bottom: "res3b6" top: "res3b7_branch2a" name: "res3b7_branch2a" type: "Convolution" convolution_param { num_output: 128 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res3b7_branch2a" top: "res3b7_branch2a" name: "bn3b7_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3b7_branch2a" top: "res3b7_branch2a" name: "scale3b7_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res3b7_branch2a" bottom: "res3b7_branch2a" name: "res3b7_branch2a_relu" type: "ReLU" } layer { bottom: "res3b7_branch2a" top: "res3b7_branch2b" name: "res3b7_branch2b" type: "Convolution" convolution_param { num_output: 128 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res3b7_branch2b" top: "res3b7_branch2b" name: "bn3b7_branch2b" type: "BatchNorm" batch_norm_param { 
use_global_stats: true } } layer { bottom: "res3b7_branch2b" top: "res3b7_branch2b" name: "scale3b7_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res3b7_branch2b" bottom: "res3b7_branch2b" name: "res3b7_branch2b_relu" type: "ReLU" } layer { bottom: "res3b7_branch2b" top: "res3b7_branch2c" name: "res3b7_branch2c" type: "Convolution" convolution_param { num_output: 512 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res3b7_branch2c" top: "res3b7_branch2c" name: "bn3b7_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3b7_branch2c" top: "res3b7_branch2c" name: "scale3b7_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res3b6" bottom: "res3b7_branch2c" top: "res3b7" name: "res3b7" type: "Eltwise" } layer { bottom: "res3b7" top: "res3b7" name: "res3b7_relu" type: "ReLU" } layer { bottom: "res3b7" top: "res4a_branch1" name: "res4a_branch1" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 2 bias_term: false } } layer { bottom: "res4a_branch1" top: "res4a_branch1" name: "bn4a_branch1" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4a_branch1" top: "res4a_branch1" name: "scale4a_branch1" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res3b7" top: "res4a_branch2a" name: "res4a_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 2 bias_term: false } } layer { bottom: "res4a_branch2a" top: "res4a_branch2a" name: "bn4a_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4a_branch2a" top: "res4a_branch2a" name: "scale4a_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4a_branch2a" bottom: "res4a_branch2a" name: "res4a_branch2a_relu" type: "ReLU" } layer { bottom: "res4a_branch2a" top: "res4a_branch2b" name: "res4a_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4a_branch2b" top: "res4a_branch2b" name: "bn4a_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4a_branch2b" top: "res4a_branch2b" name: "scale4a_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4a_branch2b" bottom: "res4a_branch2b" name: "res4a_branch2b_relu" type: "ReLU" } layer { bottom: "res4a_branch2b" top: "res4a_branch2c" name: "res4a_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4a_branch2c" top: "res4a_branch2c" name: "bn4a_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4a_branch2c" top: "res4a_branch2c" name: "scale4a_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4a_branch1" bottom: "res4a_branch2c" top: "res4a" name: "res4a" type: "Eltwise" } layer { bottom: "res4a" top: "res4a" name: "res4a_relu" type: "ReLU" } layer { bottom: "res4a" top: "res4b1_branch2a" name: "res4b1_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b1_branch2a" top: "res4b1_branch2a" name: "bn4b1_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b1_branch2a" top: "res4b1_branch2a" name: "scale4b1_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b1_branch2a" 
bottom: "res4b1_branch2a" name: "res4b1_branch2a_relu" type: "ReLU" } layer { bottom: "res4b1_branch2a" top: "res4b1_branch2b" name: "res4b1_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b1_branch2b" top: "res4b1_branch2b" name: "bn4b1_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b1_branch2b" top: "res4b1_branch2b" name: "scale4b1_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b1_branch2b" bottom: "res4b1_branch2b" name: "res4b1_branch2b_relu" type: "ReLU" } layer { bottom: "res4b1_branch2b" top: "res4b1_branch2c" name: "res4b1_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b1_branch2c" top: "res4b1_branch2c" name: "bn4b1_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b1_branch2c" top: "res4b1_branch2c" name: "scale4b1_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4a" bottom: "res4b1_branch2c" top: "res4b1" name: "res4b1" type: "Eltwise" } layer { bottom: "res4b1" top: "res4b1" name: "res4b1_relu" type: "ReLU" } layer { bottom: "res4b1" top: "res4b2_branch2a" name: "res4b2_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b2_branch2a" top: "res4b2_branch2a" name: "bn4b2_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b2_branch2a" top: "res4b2_branch2a" name: "scale4b2_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b2_branch2a" bottom: "res4b2_branch2a" name: "res4b2_branch2a_relu" type: "ReLU" } layer { bottom: "res4b2_branch2a" top: "res4b2_branch2b" name: "res4b2_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b2_branch2b" top: "res4b2_branch2b" name: "bn4b2_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b2_branch2b" top: "res4b2_branch2b" name: "scale4b2_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b2_branch2b" bottom: "res4b2_branch2b" name: "res4b2_branch2b_relu" type: "ReLU" } layer { bottom: "res4b2_branch2b" top: "res4b2_branch2c" name: "res4b2_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b2_branch2c" top: "res4b2_branch2c" name: "bn4b2_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b2_branch2c" top: "res4b2_branch2c" name: "scale4b2_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b1" bottom: "res4b2_branch2c" top: "res4b2" name: "res4b2" type: "Eltwise" } layer { bottom: "res4b2" top: "res4b2" name: "res4b2_relu" type: "ReLU" } layer { bottom: "res4b2" top: "res4b3_branch2a" name: "res4b3_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b3_branch2a" top: "res4b3_branch2a" name: "bn4b3_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b3_branch2a" top: "res4b3_branch2a" name: "scale4b3_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b3_branch2a" bottom: "res4b3_branch2a" name: 
"res4b3_branch2a_relu" type: "ReLU" } layer { bottom: "res4b3_branch2a" top: "res4b3_branch2b" name: "res4b3_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b3_branch2b" top: "res4b3_branch2b" name: "bn4b3_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b3_branch2b" top: "res4b3_branch2b" name: "scale4b3_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b3_branch2b" bottom: "res4b3_branch2b" name: "res4b3_branch2b_relu" type: "ReLU" } layer { bottom: "res4b3_branch2b" top: "res4b3_branch2c" name: "res4b3_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b3_branch2c" top: "res4b3_branch2c" name: "bn4b3_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b3_branch2c" top: "res4b3_branch2c" name: "scale4b3_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b2" bottom: "res4b3_branch2c" top: "res4b3" name: "res4b3" type: "Eltwise" } layer { bottom: "res4b3" top: "res4b3" name: "res4b3_relu" type: "ReLU" } layer { bottom: "res4b3" top: "res4b4_branch2a" name: "res4b4_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b4_branch2a" top: "res4b4_branch2a" name: "bn4b4_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b4_branch2a" top: "res4b4_branch2a" name: "scale4b4_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b4_branch2a" bottom: "res4b4_branch2a" name: "res4b4_branch2a_relu" type: "ReLU" } layer { bottom: "res4b4_branch2a" top: "res4b4_branch2b" name: "res4b4_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b4_branch2b" top: "res4b4_branch2b" name: "bn4b4_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b4_branch2b" top: "res4b4_branch2b" name: "scale4b4_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b4_branch2b" bottom: "res4b4_branch2b" name: "res4b4_branch2b_relu" type: "ReLU" } layer { bottom: "res4b4_branch2b" top: "res4b4_branch2c" name: "res4b4_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b4_branch2c" top: "res4b4_branch2c" name: "bn4b4_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b4_branch2c" top: "res4b4_branch2c" name: "scale4b4_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b3" bottom: "res4b4_branch2c" top: "res4b4" name: "res4b4" type: "Eltwise" } layer { bottom: "res4b4" top: "res4b4" name: "res4b4_relu" type: "ReLU" } layer { bottom: "res4b4" top: "res4b5_branch2a" name: "res4b5_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b5_branch2a" top: "res4b5_branch2a" name: "bn4b5_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b5_branch2a" top: "res4b5_branch2a" name: "scale4b5_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b5_branch2a" bottom: "res4b5_branch2a" name: "res4b5_branch2a_relu" type: 
"ReLU" } layer { bottom: "res4b5_branch2a" top: "res4b5_branch2b" name: "res4b5_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b5_branch2b" top: "res4b5_branch2b" name: "bn4b5_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b5_branch2b" top: "res4b5_branch2b" name: "scale4b5_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b5_branch2b" bottom: "res4b5_branch2b" name: "res4b5_branch2b_relu" type: "ReLU" } layer { bottom: "res4b5_branch2b" top: "res4b5_branch2c" name: "res4b5_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b5_branch2c" top: "res4b5_branch2c" name: "bn4b5_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b5_branch2c" top: "res4b5_branch2c" name: "scale4b5_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b4" bottom: "res4b5_branch2c" top: "res4b5" name: "res4b5" type: "Eltwise" } layer { bottom: "res4b5" top: "res4b5" name: "res4b5_relu" type: "ReLU" } layer { bottom: "res4b5" top: "res4b6_branch2a" name: "res4b6_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b6_branch2a" top: "res4b6_branch2a" name: "bn4b6_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b6_branch2a" top: "res4b6_branch2a" name: "scale4b6_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b6_branch2a" bottom: "res4b6_branch2a" name: "res4b6_branch2a_relu" type: "ReLU" } layer { bottom: "res4b6_branch2a" top: "res4b6_branch2b" name: "res4b6_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b6_branch2b" top: "res4b6_branch2b" name: "bn4b6_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b6_branch2b" top: "res4b6_branch2b" name: "scale4b6_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b6_branch2b" bottom: "res4b6_branch2b" name: "res4b6_branch2b_relu" type: "ReLU" } layer { bottom: "res4b6_branch2b" top: "res4b6_branch2c" name: "res4b6_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b6_branch2c" top: "res4b6_branch2c" name: "bn4b6_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b6_branch2c" top: "res4b6_branch2c" name: "scale4b6_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b5" bottom: "res4b6_branch2c" top: "res4b6" name: "res4b6" type: "Eltwise" } layer { bottom: "res4b6" top: "res4b6" name: "res4b6_relu" type: "ReLU" } layer { bottom: "res4b6" top: "res4b7_branch2a" name: "res4b7_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b7_branch2a" top: "res4b7_branch2a" name: "bn4b7_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b7_branch2a" top: "res4b7_branch2a" name: "scale4b7_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b7_branch2a" bottom: "res4b7_branch2a" name: "res4b7_branch2a_relu" type: "ReLU" } layer { bottom: 
"res4b7_branch2a" top: "res4b7_branch2b" name: "res4b7_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b7_branch2b" top: "res4b7_branch2b" name: "bn4b7_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b7_branch2b" top: "res4b7_branch2b" name: "scale4b7_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b7_branch2b" bottom: "res4b7_branch2b" name: "res4b7_branch2b_relu" type: "ReLU" } layer { bottom: "res4b7_branch2b" top: "res4b7_branch2c" name: "res4b7_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b7_branch2c" top: "res4b7_branch2c" name: "bn4b7_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b7_branch2c" top: "res4b7_branch2c" name: "scale4b7_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b6" bottom: "res4b7_branch2c" top: "res4b7" name: "res4b7" type: "Eltwise" } layer { bottom: "res4b7" top: "res4b7" name: "res4b7_relu" type: "ReLU" } layer { bottom: "res4b7" top: "res4b8_branch2a" name: "res4b8_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b8_branch2a" top: "res4b8_branch2a" name: "bn4b8_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b8_branch2a" top: "res4b8_branch2a" name: "scale4b8_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b8_branch2a" bottom: "res4b8_branch2a" name: "res4b8_branch2a_relu" type: "ReLU" } layer { bottom: "res4b8_branch2a" top: "res4b8_branch2b" name: "res4b8_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b8_branch2b" top: "res4b8_branch2b" name: "bn4b8_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b8_branch2b" top: "res4b8_branch2b" name: "scale4b8_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b8_branch2b" bottom: "res4b8_branch2b" name: "res4b8_branch2b_relu" type: "ReLU" } layer { bottom: "res4b8_branch2b" top: "res4b8_branch2c" name: "res4b8_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b8_branch2c" top: "res4b8_branch2c" name: "bn4b8_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b8_branch2c" top: "res4b8_branch2c" name: "scale4b8_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b7" bottom: "res4b8_branch2c" top: "res4b8" name: "res4b8" type: "Eltwise" } layer { bottom: "res4b8" top: "res4b8" name: "res4b8_relu" type: "ReLU" } layer { bottom: "res4b8" top: "res4b9_branch2a" name: "res4b9_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b9_branch2a" top: "res4b9_branch2a" name: "bn4b9_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b9_branch2a" top: "res4b9_branch2a" name: "scale4b9_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b9_branch2a" bottom: "res4b9_branch2a" name: "res4b9_branch2a_relu" type: "ReLU" } layer { bottom: "res4b9_branch2a" top: 
"res4b9_branch2b" name: "res4b9_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b9_branch2b" top: "res4b9_branch2b" name: "bn4b9_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b9_branch2b" top: "res4b9_branch2b" name: "scale4b9_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b9_branch2b" bottom: "res4b9_branch2b" name: "res4b9_branch2b_relu" type: "ReLU" } layer { bottom: "res4b9_branch2b" top: "res4b9_branch2c" name: "res4b9_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b9_branch2c" top: "res4b9_branch2c" name: "bn4b9_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b9_branch2c" top: "res4b9_branch2c" name: "scale4b9_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b8" bottom: "res4b9_branch2c" top: "res4b9" name: "res4b9" type: "Eltwise" } layer { bottom: "res4b9" top: "res4b9" name: "res4b9_relu" type: "ReLU" } layer { bottom: "res4b9" top: "res4b10_branch2a" name: "res4b10_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b10_branch2a" top: "res4b10_branch2a" name: "bn4b10_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b10_branch2a" top: "res4b10_branch2a" name: "scale4b10_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b10_branch2a" bottom: "res4b10_branch2a" name: "res4b10_branch2a_relu" type: "ReLU" } layer { bottom: "res4b10_branch2a" top: "res4b10_branch2b" name: "res4b10_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b10_branch2b" top: "res4b10_branch2b" name: "bn4b10_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b10_branch2b" top: "res4b10_branch2b" name: "scale4b10_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b10_branch2b" bottom: "res4b10_branch2b" name: "res4b10_branch2b_relu" type: "ReLU" } layer { bottom: "res4b10_branch2b" top: "res4b10_branch2c" name: "res4b10_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b10_branch2c" top: "res4b10_branch2c" name: "bn4b10_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b10_branch2c" top: "res4b10_branch2c" name: "scale4b10_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b9" bottom: "res4b10_branch2c" top: "res4b10" name: "res4b10" type: "Eltwise" } layer { bottom: "res4b10" top: "res4b10" name: "res4b10_relu" type: "ReLU" } layer { bottom: "res4b10" top: "res4b11_branch2a" name: "res4b11_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b11_branch2a" top: "res4b11_branch2a" name: "bn4b11_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b11_branch2a" top: "res4b11_branch2a" name: "scale4b11_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b11_branch2a" bottom: "res4b11_branch2a" name: "res4b11_branch2a_relu" type: "ReLU" } layer { bottom: 
"res4b11_branch2a" top: "res4b11_branch2b" name: "res4b11_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b11_branch2b" top: "res4b11_branch2b" name: "bn4b11_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b11_branch2b" top: "res4b11_branch2b" name: "scale4b11_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b11_branch2b" bottom: "res4b11_branch2b" name: "res4b11_branch2b_relu" type: "ReLU" } layer { bottom: "res4b11_branch2b" top: "res4b11_branch2c" name: "res4b11_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b11_branch2c" top: "res4b11_branch2c" name: "bn4b11_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b11_branch2c" top: "res4b11_branch2c" name: "scale4b11_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b10" bottom: "res4b11_branch2c" top: "res4b11" name: "res4b11" type: "Eltwise" } layer { bottom: "res4b11" top: "res4b11" name: "res4b11_relu" type: "ReLU" } layer { bottom: "res4b11" top: "res4b12_branch2a" name: "res4b12_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b12_branch2a" top: "res4b12_branch2a" name: "bn4b12_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b12_branch2a" top: "res4b12_branch2a" name: "scale4b12_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b12_branch2a" bottom: "res4b12_branch2a" name: "res4b12_branch2a_relu" type: "ReLU" } layer { bottom: "res4b12_branch2a" top: "res4b12_branch2b" name: "res4b12_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b12_branch2b" top: "res4b12_branch2b" name: "bn4b12_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b12_branch2b" top: "res4b12_branch2b" name: "scale4b12_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b12_branch2b" bottom: "res4b12_branch2b" name: "res4b12_branch2b_relu" type: "ReLU" } layer { bottom: "res4b12_branch2b" top: "res4b12_branch2c" name: "res4b12_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b12_branch2c" top: "res4b12_branch2c" name: "bn4b12_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b12_branch2c" top: "res4b12_branch2c" name: "scale4b12_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b11" bottom: "res4b12_branch2c" top: "res4b12" name: "res4b12" type: "Eltwise" } layer { bottom: "res4b12" top: "res4b12" name: "res4b12_relu" type: "ReLU" } layer { bottom: "res4b12" top: "res4b13_branch2a" name: "res4b13_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b13_branch2a" top: "res4b13_branch2a" name: "bn4b13_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b13_branch2a" top: "res4b13_branch2a" name: "scale4b13_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b13_branch2a" bottom: "res4b13_branch2a" name: 
"res4b13_branch2a_relu" type: "ReLU" } layer { bottom: "res4b13_branch2a" top: "res4b13_branch2b" name: "res4b13_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b13_branch2b" top: "res4b13_branch2b" name: "bn4b13_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b13_branch2b" top: "res4b13_branch2b" name: "scale4b13_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b13_branch2b" bottom: "res4b13_branch2b" name: "res4b13_branch2b_relu" type: "ReLU" } layer { bottom: "res4b13_branch2b" top: "res4b13_branch2c" name: "res4b13_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b13_branch2c" top: "res4b13_branch2c" name: "bn4b13_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b13_branch2c" top: "res4b13_branch2c" name: "scale4b13_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b12" bottom: "res4b13_branch2c" top: "res4b13" name: "res4b13" type: "Eltwise" } layer { bottom: "res4b13" top: "res4b13" name: "res4b13_relu" type: "ReLU" } layer { bottom: "res4b13" top: "res4b14_branch2a" name: "res4b14_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b14_branch2a" top: "res4b14_branch2a" name: "bn4b14_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b14_branch2a" top: "res4b14_branch2a" name: "scale4b14_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b14_branch2a" bottom: "res4b14_branch2a" name: "res4b14_branch2a_relu" type: "ReLU" } layer { bottom: "res4b14_branch2a" top: "res4b14_branch2b" name: "res4b14_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b14_branch2b" top: "res4b14_branch2b" name: "bn4b14_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b14_branch2b" top: "res4b14_branch2b" name: "scale4b14_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b14_branch2b" bottom: "res4b14_branch2b" name: "res4b14_branch2b_relu" type: "ReLU" } layer { bottom: "res4b14_branch2b" top: "res4b14_branch2c" name: "res4b14_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b14_branch2c" top: "res4b14_branch2c" name: "bn4b14_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b14_branch2c" top: "res4b14_branch2c" name: "scale4b14_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b13" bottom: "res4b14_branch2c" top: "res4b14" name: "res4b14" type: "Eltwise" } layer { bottom: "res4b14" top: "res4b14" name: "res4b14_relu" type: "ReLU" } layer { bottom: "res4b14" top: "res4b15_branch2a" name: "res4b15_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b15_branch2a" top: "res4b15_branch2a" name: "bn4b15_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b15_branch2a" top: "res4b15_branch2a" name: "scale4b15_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: 
"res4b15_branch2a" bottom: "res4b15_branch2a" name: "res4b15_branch2a_relu" type: "ReLU" } layer { bottom: "res4b15_branch2a" top: "res4b15_branch2b" name: "res4b15_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b15_branch2b" top: "res4b15_branch2b" name: "bn4b15_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b15_branch2b" top: "res4b15_branch2b" name: "scale4b15_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b15_branch2b" bottom: "res4b15_branch2b" name: "res4b15_branch2b_relu" type: "ReLU" } layer { bottom: "res4b15_branch2b" top: "res4b15_branch2c" name: "res4b15_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b15_branch2c" top: "res4b15_branch2c" name: "bn4b15_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b15_branch2c" top: "res4b15_branch2c" name: "scale4b15_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b14" bottom: "res4b15_branch2c" top: "res4b15" name: "res4b15" type: "Eltwise" } layer { bottom: "res4b15" top: "res4b15" name: "res4b15_relu" type: "ReLU" } layer { bottom: "res4b15" top: "res4b16_branch2a" name: "res4b16_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b16_branch2a" top: "res4b16_branch2a" name: "bn4b16_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b16_branch2a" top: "res4b16_branch2a" name: "scale4b16_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b16_branch2a" bottom: "res4b16_branch2a" name: "res4b16_branch2a_relu" type: "ReLU" } layer { bottom: "res4b16_branch2a" top: "res4b16_branch2b" name: "res4b16_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b16_branch2b" top: "res4b16_branch2b" name: "bn4b16_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b16_branch2b" top: "res4b16_branch2b" name: "scale4b16_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b16_branch2b" bottom: "res4b16_branch2b" name: "res4b16_branch2b_relu" type: "ReLU" } layer { bottom: "res4b16_branch2b" top: "res4b16_branch2c" name: "res4b16_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b16_branch2c" top: "res4b16_branch2c" name: "bn4b16_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b16_branch2c" top: "res4b16_branch2c" name: "scale4b16_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b15" bottom: "res4b16_branch2c" top: "res4b16" name: "res4b16" type: "Eltwise" } layer { bottom: "res4b16" top: "res4b16" name: "res4b16_relu" type: "ReLU" } layer { bottom: "res4b16" top: "res4b17_branch2a" name: "res4b17_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b17_branch2a" top: "res4b17_branch2a" name: "bn4b17_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b17_branch2a" top: "res4b17_branch2a" name: "scale4b17_branch2a" type: "Scale" 
scale_param { bias_term: true } } layer { top: "res4b17_branch2a" bottom: "res4b17_branch2a" name: "res4b17_branch2a_relu" type: "ReLU" } layer { bottom: "res4b17_branch2a" top: "res4b17_branch2b" name: "res4b17_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b17_branch2b" top: "res4b17_branch2b" name: "bn4b17_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b17_branch2b" top: "res4b17_branch2b" name: "scale4b17_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b17_branch2b" bottom: "res4b17_branch2b" name: "res4b17_branch2b_relu" type: "ReLU" } layer { bottom: "res4b17_branch2b" top: "res4b17_branch2c" name: "res4b17_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b17_branch2c" top: "res4b17_branch2c" name: "bn4b17_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b17_branch2c" top: "res4b17_branch2c" name: "scale4b17_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b16" bottom: "res4b17_branch2c" top: "res4b17" name: "res4b17" type: "Eltwise" } layer { bottom: "res4b17" top: "res4b17" name: "res4b17_relu" type: "ReLU" } layer { bottom: "res4b17" top: "res4b18_branch2a" name: "res4b18_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b18_branch2a" top: "res4b18_branch2a" name: "bn4b18_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b18_branch2a" top: "res4b18_branch2a" name: "scale4b18_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b18_branch2a" bottom: "res4b18_branch2a" name: "res4b18_branch2a_relu" type: "ReLU" } layer { bottom: "res4b18_branch2a" top: "res4b18_branch2b" name: "res4b18_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b18_branch2b" top: "res4b18_branch2b" name: "bn4b18_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b18_branch2b" top: "res4b18_branch2b" name: "scale4b18_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b18_branch2b" bottom: "res4b18_branch2b" name: "res4b18_branch2b_relu" type: "ReLU" } layer { bottom: "res4b18_branch2b" top: "res4b18_branch2c" name: "res4b18_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b18_branch2c" top: "res4b18_branch2c" name: "bn4b18_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b18_branch2c" top: "res4b18_branch2c" name: "scale4b18_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b17" bottom: "res4b18_branch2c" top: "res4b18" name: "res4b18" type: "Eltwise" } layer { bottom: "res4b18" top: "res4b18" name: "res4b18_relu" type: "ReLU" } layer { bottom: "res4b18" top: "res4b19_branch2a" name: "res4b19_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b19_branch2a" top: "res4b19_branch2a" name: "bn4b19_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b19_branch2a" top: 
"res4b19_branch2a" name: "scale4b19_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b19_branch2a" bottom: "res4b19_branch2a" name: "res4b19_branch2a_relu" type: "ReLU" } layer { bottom: "res4b19_branch2a" top: "res4b19_branch2b" name: "res4b19_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b19_branch2b" top: "res4b19_branch2b" name: "bn4b19_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b19_branch2b" top: "res4b19_branch2b" name: "scale4b19_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b19_branch2b" bottom: "res4b19_branch2b" name: "res4b19_branch2b_relu" type: "ReLU" } layer { bottom: "res4b19_branch2b" top: "res4b19_branch2c" name: "res4b19_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b19_branch2c" top: "res4b19_branch2c" name: "bn4b19_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b19_branch2c" top: "res4b19_branch2c" name: "scale4b19_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b18" bottom: "res4b19_branch2c" top: "res4b19" name: "res4b19" type: "Eltwise" } layer { bottom: "res4b19" top: "res4b19" name: "res4b19_relu" type: "ReLU" } layer { bottom: "res4b19" top: "res4b20_branch2a" name: "res4b20_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b20_branch2a" top: "res4b20_branch2a" name: "bn4b20_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b20_branch2a" top: "res4b20_branch2a" name: "scale4b20_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b20_branch2a" bottom: "res4b20_branch2a" name: "res4b20_branch2a_relu" type: "ReLU" } layer { bottom: "res4b20_branch2a" top: "res4b20_branch2b" name: "res4b20_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b20_branch2b" top: "res4b20_branch2b" name: "bn4b20_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b20_branch2b" top: "res4b20_branch2b" name: "scale4b20_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b20_branch2b" bottom: "res4b20_branch2b" name: "res4b20_branch2b_relu" type: "ReLU" } layer { bottom: "res4b20_branch2b" top: "res4b20_branch2c" name: "res4b20_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b20_branch2c" top: "res4b20_branch2c" name: "bn4b20_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b20_branch2c" top: "res4b20_branch2c" name: "scale4b20_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b19" bottom: "res4b20_branch2c" top: "res4b20" name: "res4b20" type: "Eltwise" } layer { bottom: "res4b20" top: "res4b20" name: "res4b20_relu" type: "ReLU" } layer { bottom: "res4b20" top: "res4b21_branch2a" name: "res4b21_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b21_branch2a" top: "res4b21_branch2a" name: "bn4b21_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: 
true } } layer { bottom: "res4b21_branch2a" top: "res4b21_branch2a" name: "scale4b21_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b21_branch2a" bottom: "res4b21_branch2a" name: "res4b21_branch2a_relu" type: "ReLU" } layer { bottom: "res4b21_branch2a" top: "res4b21_branch2b" name: "res4b21_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b21_branch2b" top: "res4b21_branch2b" name: "bn4b21_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b21_branch2b" top: "res4b21_branch2b" name: "scale4b21_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b21_branch2b" bottom: "res4b21_branch2b" name: "res4b21_branch2b_relu" type: "ReLU" } layer { bottom: "res4b21_branch2b" top: "res4b21_branch2c" name: "res4b21_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b21_branch2c" top: "res4b21_branch2c" name: "bn4b21_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b21_branch2c" top: "res4b21_branch2c" name: "scale4b21_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b20" bottom: "res4b21_branch2c" top: "res4b21" name: "res4b21" type: "Eltwise" } layer { bottom: "res4b21" top: "res4b21" name: "res4b21_relu" type: "ReLU" } layer { bottom: "res4b21" top: "res4b22_branch2a" name: "res4b22_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b22_branch2a" top: "res4b22_branch2a" name: "bn4b22_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b22_branch2a" top: "res4b22_branch2a" name: "scale4b22_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b22_branch2a" bottom: "res4b22_branch2a" name: "res4b22_branch2a_relu" type: "ReLU" } layer { bottom: "res4b22_branch2a" top: "res4b22_branch2b" name: "res4b22_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b22_branch2b" top: "res4b22_branch2b" name: "bn4b22_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b22_branch2b" top: "res4b22_branch2b" name: "scale4b22_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b22_branch2b" bottom: "res4b22_branch2b" name: "res4b22_branch2b_relu" type: "ReLU" } layer { bottom: "res4b22_branch2b" top: "res4b22_branch2c" name: "res4b22_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b22_branch2c" top: "res4b22_branch2c" name: "bn4b22_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b22_branch2c" top: "res4b22_branch2c" name: "scale4b22_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b21" bottom: "res4b22_branch2c" top: "res4b22" name: "res4b22" type: "Eltwise" } layer { bottom: "res4b22" top: "res4b22" name: "res4b22_relu" type: "ReLU" } layer { bottom: "res4b22" top: "res4b23_branch2a" name: "res4b23_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b23_branch2a" top: "res4b23_branch2a" name: "bn4b23_branch2a" type: 
"BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b23_branch2a" top: "res4b23_branch2a" name: "scale4b23_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b23_branch2a" bottom: "res4b23_branch2a" name: "res4b23_branch2a_relu" type: "ReLU" } layer { bottom: "res4b23_branch2a" top: "res4b23_branch2b" name: "res4b23_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b23_branch2b" top: "res4b23_branch2b" name: "bn4b23_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b23_branch2b" top: "res4b23_branch2b" name: "scale4b23_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b23_branch2b" bottom: "res4b23_branch2b" name: "res4b23_branch2b_relu" type: "ReLU" } layer { bottom: "res4b23_branch2b" top: "res4b23_branch2c" name: "res4b23_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b23_branch2c" top: "res4b23_branch2c" name: "bn4b23_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b23_branch2c" top: "res4b23_branch2c" name: "scale4b23_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b22" bottom: "res4b23_branch2c" top: "res4b23" name: "res4b23" type: "Eltwise" } layer { bottom: "res4b23" top: "res4b23" name: "res4b23_relu" type: "ReLU" } layer { bottom: "res4b23" top: "res4b24_branch2a" name: "res4b24_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b24_branch2a" top: "res4b24_branch2a" name: "bn4b24_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b24_branch2a" top: "res4b24_branch2a" name: "scale4b24_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b24_branch2a" bottom: "res4b24_branch2a" name: "res4b24_branch2a_relu" type: "ReLU" } layer { bottom: "res4b24_branch2a" top: "res4b24_branch2b" name: "res4b24_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b24_branch2b" top: "res4b24_branch2b" name: "bn4b24_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b24_branch2b" top: "res4b24_branch2b" name: "scale4b24_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b24_branch2b" bottom: "res4b24_branch2b" name: "res4b24_branch2b_relu" type: "ReLU" } layer { bottom: "res4b24_branch2b" top: "res4b24_branch2c" name: "res4b24_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b24_branch2c" top: "res4b24_branch2c" name: "bn4b24_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b24_branch2c" top: "res4b24_branch2c" name: "scale4b24_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b23" bottom: "res4b24_branch2c" top: "res4b24" name: "res4b24" type: "Eltwise" } layer { bottom: "res4b24" top: "res4b24" name: "res4b24_relu" type: "ReLU" } layer { bottom: "res4b24" top: "res4b25_branch2a" name: "res4b25_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b25_branch2a" top: 
"res4b25_branch2a" name: "bn4b25_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b25_branch2a" top: "res4b25_branch2a" name: "scale4b25_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b25_branch2a" bottom: "res4b25_branch2a" name: "res4b25_branch2a_relu" type: "ReLU" } layer { bottom: "res4b25_branch2a" top: "res4b25_branch2b" name: "res4b25_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b25_branch2b" top: "res4b25_branch2b" name: "bn4b25_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b25_branch2b" top: "res4b25_branch2b" name: "scale4b25_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b25_branch2b" bottom: "res4b25_branch2b" name: "res4b25_branch2b_relu" type: "ReLU" } layer { bottom: "res4b25_branch2b" top: "res4b25_branch2c" name: "res4b25_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b25_branch2c" top: "res4b25_branch2c" name: "bn4b25_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b25_branch2c" top: "res4b25_branch2c" name: "scale4b25_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b24" bottom: "res4b25_branch2c" top: "res4b25" name: "res4b25" type: "Eltwise" } layer { bottom: "res4b25" top: "res4b25" name: "res4b25_relu" type: "ReLU" } layer { bottom: "res4b25" top: "res4b26_branch2a" name: "res4b26_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b26_branch2a" top: "res4b26_branch2a" name: "bn4b26_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b26_branch2a" top: "res4b26_branch2a" name: "scale4b26_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b26_branch2a" bottom: "res4b26_branch2a" name: "res4b26_branch2a_relu" type: "ReLU" } layer { bottom: "res4b26_branch2a" top: "res4b26_branch2b" name: "res4b26_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b26_branch2b" top: "res4b26_branch2b" name: "bn4b26_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b26_branch2b" top: "res4b26_branch2b" name: "scale4b26_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b26_branch2b" bottom: "res4b26_branch2b" name: "res4b26_branch2b_relu" type: "ReLU" } layer { bottom: "res4b26_branch2b" top: "res4b26_branch2c" name: "res4b26_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b26_branch2c" top: "res4b26_branch2c" name: "bn4b26_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b26_branch2c" top: "res4b26_branch2c" name: "scale4b26_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b25" bottom: "res4b26_branch2c" top: "res4b26" name: "res4b26" type: "Eltwise" } layer { bottom: "res4b26" top: "res4b26" name: "res4b26_relu" type: "ReLU" } layer { bottom: "res4b26" top: "res4b27_branch2a" name: "res4b27_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: 
false } } layer { bottom: "res4b27_branch2a" top: "res4b27_branch2a" name: "bn4b27_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b27_branch2a" top: "res4b27_branch2a" name: "scale4b27_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b27_branch2a" bottom: "res4b27_branch2a" name: "res4b27_branch2a_relu" type: "ReLU" } layer { bottom: "res4b27_branch2a" top: "res4b27_branch2b" name: "res4b27_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b27_branch2b" top: "res4b27_branch2b" name: "bn4b27_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b27_branch2b" top: "res4b27_branch2b" name: "scale4b27_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b27_branch2b" bottom: "res4b27_branch2b" name: "res4b27_branch2b_relu" type: "ReLU" } layer { bottom: "res4b27_branch2b" top: "res4b27_branch2c" name: "res4b27_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b27_branch2c" top: "res4b27_branch2c" name: "bn4b27_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b27_branch2c" top: "res4b27_branch2c" name: "scale4b27_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b26" bottom: "res4b27_branch2c" top: "res4b27" name: "res4b27" type: "Eltwise" } layer { bottom: "res4b27" top: "res4b27" name: "res4b27_relu" type: "ReLU" } layer { bottom: "res4b27" top: "res4b28_branch2a" name: "res4b28_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b28_branch2a" top: "res4b28_branch2a" name: "bn4b28_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b28_branch2a" top: "res4b28_branch2a" name: "scale4b28_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b28_branch2a" bottom: "res4b28_branch2a" name: "res4b28_branch2a_relu" type: "ReLU" } layer { bottom: "res4b28_branch2a" top: "res4b28_branch2b" name: "res4b28_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b28_branch2b" top: "res4b28_branch2b" name: "bn4b28_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b28_branch2b" top: "res4b28_branch2b" name: "scale4b28_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b28_branch2b" bottom: "res4b28_branch2b" name: "res4b28_branch2b_relu" type: "ReLU" } layer { bottom: "res4b28_branch2b" top: "res4b28_branch2c" name: "res4b28_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b28_branch2c" top: "res4b28_branch2c" name: "bn4b28_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b28_branch2c" top: "res4b28_branch2c" name: "scale4b28_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b27" bottom: "res4b28_branch2c" top: "res4b28" name: "res4b28" type: "Eltwise" } layer { bottom: "res4b28" top: "res4b28" name: "res4b28_relu" type: "ReLU" } layer { bottom: "res4b28" top: "res4b29_branch2a" name: "res4b29_branch2a" type: "Convolution" convolution_param { num_output: 
256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b29_branch2a" top: "res4b29_branch2a" name: "bn4b29_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b29_branch2a" top: "res4b29_branch2a" name: "scale4b29_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b29_branch2a" bottom: "res4b29_branch2a" name: "res4b29_branch2a_relu" type: "ReLU" } layer { bottom: "res4b29_branch2a" top: "res4b29_branch2b" name: "res4b29_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b29_branch2b" top: "res4b29_branch2b" name: "bn4b29_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b29_branch2b" top: "res4b29_branch2b" name: "scale4b29_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b29_branch2b" bottom: "res4b29_branch2b" name: "res4b29_branch2b_relu" type: "ReLU" } layer { bottom: "res4b29_branch2b" top: "res4b29_branch2c" name: "res4b29_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b29_branch2c" top: "res4b29_branch2c" name: "bn4b29_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b29_branch2c" top: "res4b29_branch2c" name: "scale4b29_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b28" bottom: "res4b29_branch2c" top: "res4b29" name: "res4b29" type: "Eltwise" } layer { bottom: "res4b29" top: "res4b29" name: "res4b29_relu" type: "ReLU" } layer { bottom: "res4b29" top: "res4b30_branch2a" name: "res4b30_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b30_branch2a" top: "res4b30_branch2a" name: "bn4b30_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b30_branch2a" top: "res4b30_branch2a" name: "scale4b30_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b30_branch2a" bottom: "res4b30_branch2a" name: "res4b30_branch2a_relu" type: "ReLU" } layer { bottom: "res4b30_branch2a" top: "res4b30_branch2b" name: "res4b30_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b30_branch2b" top: "res4b30_branch2b" name: "bn4b30_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b30_branch2b" top: "res4b30_branch2b" name: "scale4b30_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b30_branch2b" bottom: "res4b30_branch2b" name: "res4b30_branch2b_relu" type: "ReLU" } layer { bottom: "res4b30_branch2b" top: "res4b30_branch2c" name: "res4b30_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b30_branch2c" top: "res4b30_branch2c" name: "bn4b30_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b30_branch2c" top: "res4b30_branch2c" name: "scale4b30_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b29" bottom: "res4b30_branch2c" top: "res4b30" name: "res4b30" type: "Eltwise" } layer { bottom: "res4b30" top: "res4b30" name: "res4b30_relu" type: "ReLU" } layer { bottom: "res4b30" top: "res4b31_branch2a" name: "res4b31_branch2a" 
type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b31_branch2a" top: "res4b31_branch2a" name: "bn4b31_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b31_branch2a" top: "res4b31_branch2a" name: "scale4b31_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b31_branch2a" bottom: "res4b31_branch2a" name: "res4b31_branch2a_relu" type: "ReLU" } layer { bottom: "res4b31_branch2a" top: "res4b31_branch2b" name: "res4b31_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b31_branch2b" top: "res4b31_branch2b" name: "bn4b31_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b31_branch2b" top: "res4b31_branch2b" name: "scale4b31_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b31_branch2b" bottom: "res4b31_branch2b" name: "res4b31_branch2b_relu" type: "ReLU" } layer { bottom: "res4b31_branch2b" top: "res4b31_branch2c" name: "res4b31_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b31_branch2c" top: "res4b31_branch2c" name: "bn4b31_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b31_branch2c" top: "res4b31_branch2c" name: "scale4b31_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b30" bottom: "res4b31_branch2c" top: "res4b31" name: "res4b31" type: "Eltwise" } layer { bottom: "res4b31" top: "res4b31" name: "res4b31_relu" type: "ReLU" } layer { bottom: "res4b31" top: "res4b32_branch2a" name: "res4b32_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b32_branch2a" top: "res4b32_branch2a" name: "bn4b32_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b32_branch2a" top: "res4b32_branch2a" name: "scale4b32_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b32_branch2a" bottom: "res4b32_branch2a" name: "res4b32_branch2a_relu" type: "ReLU" } layer { bottom: "res4b32_branch2a" top: "res4b32_branch2b" name: "res4b32_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b32_branch2b" top: "res4b32_branch2b" name: "bn4b32_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b32_branch2b" top: "res4b32_branch2b" name: "scale4b32_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b32_branch2b" bottom: "res4b32_branch2b" name: "res4b32_branch2b_relu" type: "ReLU" } layer { bottom: "res4b32_branch2b" top: "res4b32_branch2c" name: "res4b32_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b32_branch2c" top: "res4b32_branch2c" name: "bn4b32_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b32_branch2c" top: "res4b32_branch2c" name: "scale4b32_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b31" bottom: "res4b32_branch2c" top: "res4b32" name: "res4b32" type: "Eltwise" } layer { bottom: "res4b32" top: "res4b32" name: "res4b32_relu" type: "ReLU" } layer { bottom: "res4b32" 
top: "res4b33_branch2a" name: "res4b33_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b33_branch2a" top: "res4b33_branch2a" name: "bn4b33_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b33_branch2a" top: "res4b33_branch2a" name: "scale4b33_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b33_branch2a" bottom: "res4b33_branch2a" name: "res4b33_branch2a_relu" type: "ReLU" } layer { bottom: "res4b33_branch2a" top: "res4b33_branch2b" name: "res4b33_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b33_branch2b" top: "res4b33_branch2b" name: "bn4b33_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b33_branch2b" top: "res4b33_branch2b" name: "scale4b33_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b33_branch2b" bottom: "res4b33_branch2b" name: "res4b33_branch2b_relu" type: "ReLU" } layer { bottom: "res4b33_branch2b" top: "res4b33_branch2c" name: "res4b33_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b33_branch2c" top: "res4b33_branch2c" name: "bn4b33_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b33_branch2c" top: "res4b33_branch2c" name: "scale4b33_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b32" bottom: "res4b33_branch2c" top: "res4b33" name: "res4b33" type: "Eltwise" } layer { bottom: "res4b33" top: "res4b33" name: "res4b33_relu" type: "ReLU" } layer { bottom: "res4b33" top: "res4b34_branch2a" name: "res4b34_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b34_branch2a" top: "res4b34_branch2a" name: "bn4b34_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b34_branch2a" top: "res4b34_branch2a" name: "scale4b34_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b34_branch2a" bottom: "res4b34_branch2a" name: "res4b34_branch2a_relu" type: "ReLU" } layer { bottom: "res4b34_branch2a" top: "res4b34_branch2b" name: "res4b34_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b34_branch2b" top: "res4b34_branch2b" name: "bn4b34_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b34_branch2b" top: "res4b34_branch2b" name: "scale4b34_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b34_branch2b" bottom: "res4b34_branch2b" name: "res4b34_branch2b_relu" type: "ReLU" } layer { bottom: "res4b34_branch2b" top: "res4b34_branch2c" name: "res4b34_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b34_branch2c" top: "res4b34_branch2c" name: "bn4b34_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b34_branch2c" top: "res4b34_branch2c" name: "scale4b34_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b33" bottom: "res4b34_branch2c" top: "res4b34" name: "res4b34" type: "Eltwise" } layer { bottom: "res4b34" top: "res4b34" name: 
"res4b34_relu" type: "ReLU" } layer { bottom: "res4b34" top: "res4b35_branch2a" name: "res4b35_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b35_branch2a" top: "res4b35_branch2a" name: "bn4b35_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b35_branch2a" top: "res4b35_branch2a" name: "scale4b35_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b35_branch2a" bottom: "res4b35_branch2a" name: "res4b35_branch2a_relu" type: "ReLU" } layer { bottom: "res4b35_branch2a" top: "res4b35_branch2b" name: "res4b35_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b35_branch2b" top: "res4b35_branch2b" name: "bn4b35_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b35_branch2b" top: "res4b35_branch2b" name: "scale4b35_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res4b35_branch2b" bottom: "res4b35_branch2b" name: "res4b35_branch2b_relu" type: "ReLU" } layer { bottom: "res4b35_branch2b" top: "res4b35_branch2c" name: "res4b35_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b35_branch2c" top: "res4b35_branch2c" name: "bn4b35_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b35_branch2c" top: "res4b35_branch2c" name: "scale4b35_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b34" bottom: "res4b35_branch2c" top: "res4b35" name: "res4b35" type: "Eltwise" } layer { bottom: "res4b35" top: "res4b35" name: "res4b35_relu" type: "ReLU" } layer { bottom: "res4b35" top: "res5a_branch1" name: "res5a_branch1" type: "Convolution" convolution_param { num_output: 2048 kernel_size: 1 pad: 0 stride: 2 bias_term: false } } layer { bottom: "res5a_branch1" top: "res5a_branch1" name: "bn5a_branch1" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res5a_branch1" top: "res5a_branch1" name: "scale5a_branch1" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b35" top: "res5a_branch2a" name: "res5a_branch2a" type: "Convolution" convolution_param { num_output: 512 kernel_size: 1 pad: 0 stride: 2 bias_term: false } } layer { bottom: "res5a_branch2a" top: "res5a_branch2a" name: "bn5a_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res5a_branch2a" top: "res5a_branch2a" name: "scale5a_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res5a_branch2a" bottom: "res5a_branch2a" name: "res5a_branch2a_relu" type: "ReLU" } layer { bottom: "res5a_branch2a" top: "res5a_branch2b" name: "res5a_branch2b" type: "Convolution" convolution_param { num_output: 512 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res5a_branch2b" top: "res5a_branch2b" name: "bn5a_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res5a_branch2b" top: "res5a_branch2b" name: "scale5a_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res5a_branch2b" bottom: "res5a_branch2b" name: "res5a_branch2b_relu" type: "ReLU" } layer { bottom: "res5a_branch2b" top: "res5a_branch2c" name: "res5a_branch2c" type: "Convolution" convolution_param { num_output: 2048 kernel_size: 1 pad: 0 stride: 1 bias_term: 
false } } layer { bottom: "res5a_branch2c" top: "res5a_branch2c" name: "bn5a_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res5a_branch2c" top: "res5a_branch2c" name: "scale5a_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res5a_branch1" bottom: "res5a_branch2c" top: "res5a" name: "res5a" type: "Eltwise" } layer { bottom: "res5a" top: "res5a" name: "res5a_relu" type: "ReLU" } layer { bottom: "res5a" top: "res5b_branch2a" name: "res5b_branch2a" type: "Convolution" convolution_param { num_output: 512 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res5b_branch2a" top: "res5b_branch2a" name: "bn5b_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res5b_branch2a" top: "res5b_branch2a" name: "scale5b_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res5b_branch2a" bottom: "res5b_branch2a" name: "res5b_branch2a_relu" type: "ReLU" } layer { bottom: "res5b_branch2a" top: "res5b_branch2b" name: "res5b_branch2b" type: "Convolution" convolution_param { num_output: 512 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res5b_branch2b" top: "res5b_branch2b" name: "bn5b_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res5b_branch2b" top: "res5b_branch2b" name: "scale5b_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res5b_branch2b" bottom: "res5b_branch2b" name: "res5b_branch2b_relu" type: "ReLU" } layer { bottom: "res5b_branch2b" top: "res5b_branch2c" name: "res5b_branch2c" type: "Convolution" convolution_param { num_output: 2048 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res5b_branch2c" top: "res5b_branch2c" name: "bn5b_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res5b_branch2c" top: "res5b_branch2c" name: "scale5b_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res5a" bottom: "res5b_branch2c" top: "res5b" name: "res5b" type: "Eltwise" } layer { bottom: "res5b" top: "res5b" name: "res5b_relu" type: "ReLU" } layer { bottom: "res5b" top: "res5c_branch2a" name: "res5c_branch2a" type: "Convolution" convolution_param { num_output: 512 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res5c_branch2a" top: "res5c_branch2a" name: "bn5c_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res5c_branch2a" top: "res5c_branch2a" name: "scale5c_branch2a" type: "Scale" scale_param { bias_term: true } } layer { top: "res5c_branch2a" bottom: "res5c_branch2a" name: "res5c_branch2a_relu" type: "ReLU" } layer { bottom: "res5c_branch2a" top: "res5c_branch2b" name: "res5c_branch2b" type: "Convolution" convolution_param { num_output: 512 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res5c_branch2b" top: "res5c_branch2b" name: "bn5c_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res5c_branch2b" top: "res5c_branch2b" name: "scale5c_branch2b" type: "Scale" scale_param { bias_term: true } } layer { top: "res5c_branch2b" bottom: "res5c_branch2b" name: "res5c_branch2b_relu" type: "ReLU" } layer { bottom: "res5c_branch2b" top: "res5c_branch2c" name: "res5c_branch2c" type: "Convolution" convolution_param { num_output: 2048 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res5c_branch2c" top: "res5c_branch2c" name: "bn5c_branch2c" type: "BatchNorm" 
batch_norm_param { use_global_stats: true } } layer { bottom: "res5c_branch2c" top: "res5c_branch2c" name: "scale5c_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res5b" bottom: "res5c_branch2c" top: "res5c" name: "res5c" type: "Eltwise" } layer { bottom: "res5c" top: "res5c" name: "res5c_relu" type: "ReLU" } layer { bottom: "res5c" top: "pool5" name: "pool5" type: "Pooling" pooling_param { kernel_size: 7 stride: 1 pool: AVE } } layer { bottom: "pool5" top: "fc1000" name: "fc1000" type: "InnerProduct" inner_product_param { num_output: 1000 } } layer { bottom: "fc1000" top: "prob" name: "prob" type: "Softmax" } ================================================ FILE: models/ResNet-50-deploy.prototxt ================================================ name: "ResNet-50" input: "data" input_dim: 1 input_dim: 3 input_dim: 224 input_dim: 224 layer { bottom: "data" top: "conv1" name: "conv1" type: "Convolution" convolution_param { num_output: 64 kernel_size: 7 pad: 3 stride: 2 } } layer { bottom: "conv1" top: "conv1" name: "bn_conv1" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "conv1" top: "conv1" name: "scale_conv1" type: "Scale" scale_param { bias_term: true } } layer { bottom: "conv1" top: "conv1" name: "conv1_relu" type: "ReLU" } layer { bottom: "conv1" top: "pool1" name: "pool1" type: "Pooling" pooling_param { kernel_size: 3 stride: 2 pool: MAX } } layer { bottom: "pool1" top: "res2a_branch1" name: "res2a_branch1" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res2a_branch1" top: "res2a_branch1" name: "bn2a_branch1" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res2a_branch1" top: "res2a_branch1" name: "scale2a_branch1" type: "Scale" scale_param { bias_term: true } } layer { bottom: "pool1" top: "res2a_branch2a" name: "res2a_branch2a" type: "Convolution" convolution_param { num_output: 64 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res2a_branch2a" top: "res2a_branch2a" name: "bn2a_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res2a_branch2a" top: "res2a_branch2a" name: "scale2a_branch2a" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res2a_branch2a" top: "res2a_branch2a" name: "res2a_branch2a_relu" type: "ReLU" } layer { bottom: "res2a_branch2a" top: "res2a_branch2b" name: "res2a_branch2b" type: "Convolution" convolution_param { num_output: 64 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res2a_branch2b" top: "res2a_branch2b" name: "bn2a_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res2a_branch2b" top: "res2a_branch2b" name: "scale2a_branch2b" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res2a_branch2b" top: "res2a_branch2b" name: "res2a_branch2b_relu" type: "ReLU" } layer { bottom: "res2a_branch2b" top: "res2a_branch2c" name: "res2a_branch2c" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res2a_branch2c" top: "res2a_branch2c" name: "bn2a_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res2a_branch2c" top: "res2a_branch2c" name: "scale2a_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res2a_branch1" bottom: "res2a_branch2c" top: "res2a" name: "res2a" type: "Eltwise" } layer { bottom: "res2a" top: 
"res2a" name: "res2a_relu" type: "ReLU" } layer { bottom: "res2a" top: "res2b_branch2a" name: "res2b_branch2a" type: "Convolution" convolution_param { num_output: 64 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res2b_branch2a" top: "res2b_branch2a" name: "bn2b_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res2b_branch2a" top: "res2b_branch2a" name: "scale2b_branch2a" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res2b_branch2a" top: "res2b_branch2a" name: "res2b_branch2a_relu" type: "ReLU" } layer { bottom: "res2b_branch2a" top: "res2b_branch2b" name: "res2b_branch2b" type: "Convolution" convolution_param { num_output: 64 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res2b_branch2b" top: "res2b_branch2b" name: "bn2b_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res2b_branch2b" top: "res2b_branch2b" name: "scale2b_branch2b" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res2b_branch2b" top: "res2b_branch2b" name: "res2b_branch2b_relu" type: "ReLU" } layer { bottom: "res2b_branch2b" top: "res2b_branch2c" name: "res2b_branch2c" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res2b_branch2c" top: "res2b_branch2c" name: "bn2b_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res2b_branch2c" top: "res2b_branch2c" name: "scale2b_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res2a" bottom: "res2b_branch2c" top: "res2b" name: "res2b" type: "Eltwise" } layer { bottom: "res2b" top: "res2b" name: "res2b_relu" type: "ReLU" } layer { bottom: "res2b" top: "res2c_branch2a" name: "res2c_branch2a" type: "Convolution" convolution_param { num_output: 64 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res2c_branch2a" top: "res2c_branch2a" name: "bn2c_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res2c_branch2a" top: "res2c_branch2a" name: "scale2c_branch2a" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res2c_branch2a" top: "res2c_branch2a" name: "res2c_branch2a_relu" type: "ReLU" } layer { bottom: "res2c_branch2a" top: "res2c_branch2b" name: "res2c_branch2b" type: "Convolution" convolution_param { num_output: 64 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res2c_branch2b" top: "res2c_branch2b" name: "bn2c_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res2c_branch2b" top: "res2c_branch2b" name: "scale2c_branch2b" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res2c_branch2b" top: "res2c_branch2b" name: "res2c_branch2b_relu" type: "ReLU" } layer { bottom: "res2c_branch2b" top: "res2c_branch2c" name: "res2c_branch2c" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res2c_branch2c" top: "res2c_branch2c" name: "bn2c_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res2c_branch2c" top: "res2c_branch2c" name: "scale2c_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res2b" bottom: "res2c_branch2c" top: "res2c" name: "res2c" type: "Eltwise" } layer { bottom: "res2c" top: "res2c" name: "res2c_relu" type: "ReLU" } layer { bottom: "res2c" top: "res3a_branch1" name: "res3a_branch1" type: 
"Convolution" convolution_param { num_output: 512 kernel_size: 1 pad: 0 stride: 2 bias_term: false } } layer { bottom: "res3a_branch1" top: "res3a_branch1" name: "bn3a_branch1" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3a_branch1" top: "res3a_branch1" name: "scale3a_branch1" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res2c" top: "res3a_branch2a" name: "res3a_branch2a" type: "Convolution" convolution_param { num_output: 128 kernel_size: 1 pad: 0 stride: 2 bias_term: false } } layer { bottom: "res3a_branch2a" top: "res3a_branch2a" name: "bn3a_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3a_branch2a" top: "res3a_branch2a" name: "scale3a_branch2a" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res3a_branch2a" top: "res3a_branch2a" name: "res3a_branch2a_relu" type: "ReLU" } layer { bottom: "res3a_branch2a" top: "res3a_branch2b" name: "res3a_branch2b" type: "Convolution" convolution_param { num_output: 128 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res3a_branch2b" top: "res3a_branch2b" name: "bn3a_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3a_branch2b" top: "res3a_branch2b" name: "scale3a_branch2b" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res3a_branch2b" top: "res3a_branch2b" name: "res3a_branch2b_relu" type: "ReLU" } layer { bottom: "res3a_branch2b" top: "res3a_branch2c" name: "res3a_branch2c" type: "Convolution" convolution_param { num_output: 512 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res3a_branch2c" top: "res3a_branch2c" name: "bn3a_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3a_branch2c" top: "res3a_branch2c" name: "scale3a_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res3a_branch1" bottom: "res3a_branch2c" top: "res3a" name: "res3a" type: "Eltwise" } layer { bottom: "res3a" top: "res3a" name: "res3a_relu" type: "ReLU" } layer { bottom: "res3a" top: "res3b_branch2a" name: "res3b_branch2a" type: "Convolution" convolution_param { num_output: 128 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res3b_branch2a" top: "res3b_branch2a" name: "bn3b_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3b_branch2a" top: "res3b_branch2a" name: "scale3b_branch2a" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res3b_branch2a" top: "res3b_branch2a" name: "res3b_branch2a_relu" type: "ReLU" } layer { bottom: "res3b_branch2a" top: "res3b_branch2b" name: "res3b_branch2b" type: "Convolution" convolution_param { num_output: 128 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res3b_branch2b" top: "res3b_branch2b" name: "bn3b_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3b_branch2b" top: "res3b_branch2b" name: "scale3b_branch2b" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res3b_branch2b" top: "res3b_branch2b" name: "res3b_branch2b_relu" type: "ReLU" } layer { bottom: "res3b_branch2b" top: "res3b_branch2c" name: "res3b_branch2c" type: "Convolution" convolution_param { num_output: 512 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res3b_branch2c" top: "res3b_branch2c" name: "bn3b_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3b_branch2c" 
top: "res3b_branch2c" name: "scale3b_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res3a" bottom: "res3b_branch2c" top: "res3b" name: "res3b" type: "Eltwise" } layer { bottom: "res3b" top: "res3b" name: "res3b_relu" type: "ReLU" } layer { bottom: "res3b" top: "res3c_branch2a" name: "res3c_branch2a" type: "Convolution" convolution_param { num_output: 128 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res3c_branch2a" top: "res3c_branch2a" name: "bn3c_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3c_branch2a" top: "res3c_branch2a" name: "scale3c_branch2a" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res3c_branch2a" top: "res3c_branch2a" name: "res3c_branch2a_relu" type: "ReLU" } layer { bottom: "res3c_branch2a" top: "res3c_branch2b" name: "res3c_branch2b" type: "Convolution" convolution_param { num_output: 128 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res3c_branch2b" top: "res3c_branch2b" name: "bn3c_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3c_branch2b" top: "res3c_branch2b" name: "scale3c_branch2b" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res3c_branch2b" top: "res3c_branch2b" name: "res3c_branch2b_relu" type: "ReLU" } layer { bottom: "res3c_branch2b" top: "res3c_branch2c" name: "res3c_branch2c" type: "Convolution" convolution_param { num_output: 512 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res3c_branch2c" top: "res3c_branch2c" name: "bn3c_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3c_branch2c" top: "res3c_branch2c" name: "scale3c_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res3b" bottom: "res3c_branch2c" top: "res3c" name: "res3c" type: "Eltwise" } layer { bottom: "res3c" top: "res3c" name: "res3c_relu" type: "ReLU" } layer { bottom: "res3c" top: "res3d_branch2a" name: "res3d_branch2a" type: "Convolution" convolution_param { num_output: 128 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res3d_branch2a" top: "res3d_branch2a" name: "bn3d_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3d_branch2a" top: "res3d_branch2a" name: "scale3d_branch2a" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res3d_branch2a" top: "res3d_branch2a" name: "res3d_branch2a_relu" type: "ReLU" } layer { bottom: "res3d_branch2a" top: "res3d_branch2b" name: "res3d_branch2b" type: "Convolution" convolution_param { num_output: 128 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res3d_branch2b" top: "res3d_branch2b" name: "bn3d_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3d_branch2b" top: "res3d_branch2b" name: "scale3d_branch2b" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res3d_branch2b" top: "res3d_branch2b" name: "res3d_branch2b_relu" type: "ReLU" } layer { bottom: "res3d_branch2b" top: "res3d_branch2c" name: "res3d_branch2c" type: "Convolution" convolution_param { num_output: 512 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res3d_branch2c" top: "res3d_branch2c" name: "bn3d_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res3d_branch2c" top: "res3d_branch2c" name: "scale3d_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: 
"res3c" bottom: "res3d_branch2c" top: "res3d" name: "res3d" type: "Eltwise" } layer { bottom: "res3d" top: "res3d" name: "res3d_relu" type: "ReLU" } layer { bottom: "res3d" top: "res4a_branch1" name: "res4a_branch1" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 2 bias_term: false } } layer { bottom: "res4a_branch1" top: "res4a_branch1" name: "bn4a_branch1" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4a_branch1" top: "res4a_branch1" name: "scale4a_branch1" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res3d" top: "res4a_branch2a" name: "res4a_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 2 bias_term: false } } layer { bottom: "res4a_branch2a" top: "res4a_branch2a" name: "bn4a_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4a_branch2a" top: "res4a_branch2a" name: "scale4a_branch2a" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4a_branch2a" top: "res4a_branch2a" name: "res4a_branch2a_relu" type: "ReLU" } layer { bottom: "res4a_branch2a" top: "res4a_branch2b" name: "res4a_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4a_branch2b" top: "res4a_branch2b" name: "bn4a_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4a_branch2b" top: "res4a_branch2b" name: "scale4a_branch2b" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4a_branch2b" top: "res4a_branch2b" name: "res4a_branch2b_relu" type: "ReLU" } layer { bottom: "res4a_branch2b" top: "res4a_branch2c" name: "res4a_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4a_branch2c" top: "res4a_branch2c" name: "bn4a_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4a_branch2c" top: "res4a_branch2c" name: "scale4a_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4a_branch1" bottom: "res4a_branch2c" top: "res4a" name: "res4a" type: "Eltwise" } layer { bottom: "res4a" top: "res4a" name: "res4a_relu" type: "ReLU" } layer { bottom: "res4a" top: "res4b_branch2a" name: "res4b_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b_branch2a" top: "res4b_branch2a" name: "bn4b_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b_branch2a" top: "res4b_branch2a" name: "scale4b_branch2a" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b_branch2a" top: "res4b_branch2a" name: "res4b_branch2a_relu" type: "ReLU" } layer { bottom: "res4b_branch2a" top: "res4b_branch2b" name: "res4b_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4b_branch2b" top: "res4b_branch2b" name: "bn4b_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b_branch2b" top: "res4b_branch2b" name: "scale4b_branch2b" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b_branch2b" top: "res4b_branch2b" name: "res4b_branch2b_relu" type: "ReLU" } layer { bottom: "res4b_branch2b" top: "res4b_branch2c" name: "res4b_branch2c" type: "Convolution" convolution_param { num_output: 1024 
kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4b_branch2c" top: "res4b_branch2c" name: "bn4b_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4b_branch2c" top: "res4b_branch2c" name: "scale4b_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4a" bottom: "res4b_branch2c" top: "res4b" name: "res4b" type: "Eltwise" } layer { bottom: "res4b" top: "res4b" name: "res4b_relu" type: "ReLU" } layer { bottom: "res4b" top: "res4c_branch2a" name: "res4c_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4c_branch2a" top: "res4c_branch2a" name: "bn4c_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4c_branch2a" top: "res4c_branch2a" name: "scale4c_branch2a" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4c_branch2a" top: "res4c_branch2a" name: "res4c_branch2a_relu" type: "ReLU" } layer { bottom: "res4c_branch2a" top: "res4c_branch2b" name: "res4c_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4c_branch2b" top: "res4c_branch2b" name: "bn4c_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4c_branch2b" top: "res4c_branch2b" name: "scale4c_branch2b" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4c_branch2b" top: "res4c_branch2b" name: "res4c_branch2b_relu" type: "ReLU" } layer { bottom: "res4c_branch2b" top: "res4c_branch2c" name: "res4c_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4c_branch2c" top: "res4c_branch2c" name: "bn4c_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4c_branch2c" top: "res4c_branch2c" name: "scale4c_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4b" bottom: "res4c_branch2c" top: "res4c" name: "res4c" type: "Eltwise" } layer { bottom: "res4c" top: "res4c" name: "res4c_relu" type: "ReLU" } layer { bottom: "res4c" top: "res4d_branch2a" name: "res4d_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4d_branch2a" top: "res4d_branch2a" name: "bn4d_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4d_branch2a" top: "res4d_branch2a" name: "scale4d_branch2a" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4d_branch2a" top: "res4d_branch2a" name: "res4d_branch2a_relu" type: "ReLU" } layer { bottom: "res4d_branch2a" top: "res4d_branch2b" name: "res4d_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4d_branch2b" top: "res4d_branch2b" name: "bn4d_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4d_branch2b" top: "res4d_branch2b" name: "scale4d_branch2b" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4d_branch2b" top: "res4d_branch2b" name: "res4d_branch2b_relu" type: "ReLU" } layer { bottom: "res4d_branch2b" top: "res4d_branch2c" name: "res4d_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4d_branch2c" top: "res4d_branch2c" 
name: "bn4d_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4d_branch2c" top: "res4d_branch2c" name: "scale4d_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4c" bottom: "res4d_branch2c" top: "res4d" name: "res4d" type: "Eltwise" } layer { bottom: "res4d" top: "res4d" name: "res4d_relu" type: "ReLU" } layer { bottom: "res4d" top: "res4e_branch2a" name: "res4e_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4e_branch2a" top: "res4e_branch2a" name: "bn4e_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4e_branch2a" top: "res4e_branch2a" name: "scale4e_branch2a" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4e_branch2a" top: "res4e_branch2a" name: "res4e_branch2a_relu" type: "ReLU" } layer { bottom: "res4e_branch2a" top: "res4e_branch2b" name: "res4e_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4e_branch2b" top: "res4e_branch2b" name: "bn4e_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4e_branch2b" top: "res4e_branch2b" name: "scale4e_branch2b" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4e_branch2b" top: "res4e_branch2b" name: "res4e_branch2b_relu" type: "ReLU" } layer { bottom: "res4e_branch2b" top: "res4e_branch2c" name: "res4e_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4e_branch2c" top: "res4e_branch2c" name: "bn4e_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4e_branch2c" top: "res4e_branch2c" name: "scale4e_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4d" bottom: "res4e_branch2c" top: "res4e" name: "res4e" type: "Eltwise" } layer { bottom: "res4e" top: "res4e" name: "res4e_relu" type: "ReLU" } layer { bottom: "res4e" top: "res4f_branch2a" name: "res4f_branch2a" type: "Convolution" convolution_param { num_output: 256 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4f_branch2a" top: "res4f_branch2a" name: "bn4f_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4f_branch2a" top: "res4f_branch2a" name: "scale4f_branch2a" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4f_branch2a" top: "res4f_branch2a" name: "res4f_branch2a_relu" type: "ReLU" } layer { bottom: "res4f_branch2a" top: "res4f_branch2b" name: "res4f_branch2b" type: "Convolution" convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res4f_branch2b" top: "res4f_branch2b" name: "bn4f_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res4f_branch2b" top: "res4f_branch2b" name: "scale4f_branch2b" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4f_branch2b" top: "res4f_branch2b" name: "res4f_branch2b_relu" type: "ReLU" } layer { bottom: "res4f_branch2b" top: "res4f_branch2c" name: "res4f_branch2c" type: "Convolution" convolution_param { num_output: 1024 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res4f_branch2c" top: "res4f_branch2c" name: "bn4f_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: 
"res4f_branch2c" top: "res4f_branch2c" name: "scale4f_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4e" bottom: "res4f_branch2c" top: "res4f" name: "res4f" type: "Eltwise" } layer { bottom: "res4f" top: "res4f" name: "res4f_relu" type: "ReLU" } layer { bottom: "res4f" top: "res5a_branch1" name: "res5a_branch1" type: "Convolution" convolution_param { num_output: 2048 kernel_size: 1 pad: 0 stride: 2 bias_term: false } } layer { bottom: "res5a_branch1" top: "res5a_branch1" name: "bn5a_branch1" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res5a_branch1" top: "res5a_branch1" name: "scale5a_branch1" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res4f" top: "res5a_branch2a" name: "res5a_branch2a" type: "Convolution" convolution_param { num_output: 512 kernel_size: 1 pad: 0 stride: 2 bias_term: false } } layer { bottom: "res5a_branch2a" top: "res5a_branch2a" name: "bn5a_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res5a_branch2a" top: "res5a_branch2a" name: "scale5a_branch2a" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res5a_branch2a" top: "res5a_branch2a" name: "res5a_branch2a_relu" type: "ReLU" } layer { bottom: "res5a_branch2a" top: "res5a_branch2b" name: "res5a_branch2b" type: "Convolution" convolution_param { num_output: 512 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res5a_branch2b" top: "res5a_branch2b" name: "bn5a_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res5a_branch2b" top: "res5a_branch2b" name: "scale5a_branch2b" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res5a_branch2b" top: "res5a_branch2b" name: "res5a_branch2b_relu" type: "ReLU" } layer { bottom: "res5a_branch2b" top: "res5a_branch2c" name: "res5a_branch2c" type: "Convolution" convolution_param { num_output: 2048 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res5a_branch2c" top: "res5a_branch2c" name: "bn5a_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res5a_branch2c" top: "res5a_branch2c" name: "scale5a_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res5a_branch1" bottom: "res5a_branch2c" top: "res5a" name: "res5a" type: "Eltwise" } layer { bottom: "res5a" top: "res5a" name: "res5a_relu" type: "ReLU" } layer { bottom: "res5a" top: "res5b_branch2a" name: "res5b_branch2a" type: "Convolution" convolution_param { num_output: 512 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res5b_branch2a" top: "res5b_branch2a" name: "bn5b_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res5b_branch2a" top: "res5b_branch2a" name: "scale5b_branch2a" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res5b_branch2a" top: "res5b_branch2a" name: "res5b_branch2a_relu" type: "ReLU" } layer { bottom: "res5b_branch2a" top: "res5b_branch2b" name: "res5b_branch2b" type: "Convolution" convolution_param { num_output: 512 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res5b_branch2b" top: "res5b_branch2b" name: "bn5b_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res5b_branch2b" top: "res5b_branch2b" name: "scale5b_branch2b" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res5b_branch2b" top: "res5b_branch2b" name: "res5b_branch2b_relu" type: "ReLU" } layer { 
bottom: "res5b_branch2b" top: "res5b_branch2c" name: "res5b_branch2c" type: "Convolution" convolution_param { num_output: 2048 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res5b_branch2c" top: "res5b_branch2c" name: "bn5b_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res5b_branch2c" top: "res5b_branch2c" name: "scale5b_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res5a" bottom: "res5b_branch2c" top: "res5b" name: "res5b" type: "Eltwise" } layer { bottom: "res5b" top: "res5b" name: "res5b_relu" type: "ReLU" } layer { bottom: "res5b" top: "res5c_branch2a" name: "res5c_branch2a" type: "Convolution" convolution_param { num_output: 512 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res5c_branch2a" top: "res5c_branch2a" name: "bn5c_branch2a" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res5c_branch2a" top: "res5c_branch2a" name: "scale5c_branch2a" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res5c_branch2a" top: "res5c_branch2a" name: "res5c_branch2a_relu" type: "ReLU" } layer { bottom: "res5c_branch2a" top: "res5c_branch2b" name: "res5c_branch2b" type: "Convolution" convolution_param { num_output: 512 kernel_size: 3 pad: 1 stride: 1 bias_term: false } } layer { bottom: "res5c_branch2b" top: "res5c_branch2b" name: "bn5c_branch2b" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res5c_branch2b" top: "res5c_branch2b" name: "scale5c_branch2b" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res5c_branch2b" top: "res5c_branch2b" name: "res5c_branch2b_relu" type: "ReLU" } layer { bottom: "res5c_branch2b" top: "res5c_branch2c" name: "res5c_branch2c" type: "Convolution" convolution_param { num_output: 2048 kernel_size: 1 pad: 0 stride: 1 bias_term: false } } layer { bottom: "res5c_branch2c" top: "res5c_branch2c" name: "bn5c_branch2c" type: "BatchNorm" batch_norm_param { use_global_stats: true } } layer { bottom: "res5c_branch2c" top: "res5c_branch2c" name: "scale5c_branch2c" type: "Scale" scale_param { bias_term: true } } layer { bottom: "res5b" bottom: "res5c_branch2c" top: "res5c" name: "res5c" type: "Eltwise" } layer { bottom: "res5c" top: "res5c" name: "res5c_relu" type: "ReLU" } layer { bottom: "res5c" top: "pool5" name: "pool5" type: "Pooling" pooling_param { kernel_size: 7 stride: 1 pool: AVE } } layer { bottom: "pool5" top: "fc1000" name: "fc1000" type: "InnerProduct" inner_product_param { num_output: 1000 } } layer { bottom: "fc1000" top: "prob" name: "prob" type: "Softmax" } ================================================ FILE: models/mps_builder ================================================ #!/bin/bash -e # # Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. 
================================================
FILE: models/mps_builder
================================================
#!/bin/bash -e
#
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

cleanup() {
    kill $(jobs -p) ||:
    echo quit | nvidia-cuda-mps-control > /dev/null 2>&1 ||:
}
trap "cleanup" EXIT SIGINT SIGTERM

active_sms=${1:-100}
echo "Setting Active SM Percentage: ${active_sms}"
export CUDA_MPS_ACTIVE_THREAD_PERCENTAGE=${active_sms}

nvidia-cuda-mps-control -d ||:
sleep 1

echo
echo "Starting a new shell with MPS running..."
bash --rcfile <(echo "PS1='MPS Subshell: '")
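The same flow can also be driven programmatically rather than via a subshell. A minimal sketch (CUDA_MPS_ACTIVE_THREAD_PERCENTAGE is the real MPS environment variable used by the script above; worker.py is a hypothetical client):

import os
import subprocess

def run_under_mps(worker_cmd, active_sm_pct=50):
    # Limit MPS clients to a fraction of the GPU's SMs, as mps_builder does.
    env = dict(os.environ, CUDA_MPS_ACTIVE_THREAD_PERCENTAGE=str(active_sm_pct))
    subprocess.run(["nvidia-cuda-mps-control", "-d"], env=env, check=False)
    try:
        subprocess.run(worker_cmd, env=env, check=True)
    finally:
        # Mirror the script's cleanup(): tell the control daemon to quit.
        subprocess.run("echo quit | nvidia-cuda-mps-control",
                       shell=True, check=False)

run_under_mps(["python3", "worker.py"], active_sm_pct=50)  # worker.py is hypothetical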
================================================
FILE: models/onnx/common.py
================================================
import os
import argparse
import numpy as np
import pycuda.driver as cuda
import tensorrt as trt

try:
    # Sometimes python2 does not understand FileNotFoundError
    FileNotFoundError
except NameError:
    FileNotFoundError = IOError

def GiB(val):
    return val * 1 << 30

def find_sample_data(description="Runs a TensorRT Python sample", subfolder="", find_files=[]):
    '''
    Parses sample arguments.
    Args:
        description (str): Description of the sample.
        subfolder (str): The subfolder containing data relevant to this sample.
        find_files (list): A list of filenames to find. Each filename will be replaced with an absolute path.
    Returns:
        str: Path of data directory.
    Raises:
        FileNotFoundError
    '''
    kDEFAULT_DATA_ROOT = os.path.abspath("/usr/src/tensorrt/data")
    parser = argparse.ArgumentParser(description=description)
    # Standard command-line arguments for all samples.
    parser.add_argument("-d", "--datadir", help="Location of the TensorRT sample data directory.")
    args, unknown_args = parser.parse_known_args()
    # If data directory is not specified, use the default.
    data_root = args.datadir if args.datadir else kDEFAULT_DATA_ROOT
    data_path = os.path.join(data_root, subfolder) if subfolder else data_root
    # Make sure data directory exists.
    if not (os.path.exists(data_path)):
        raise FileNotFoundError(data_path + " does not exist. Please provide the correct data path with the -d option.")
    # Find all requested files.
    for index, f in enumerate(find_files):
        find_files[index] = os.path.abspath(os.path.join(data_path, f))
        if not os.path.exists(find_files[index]):
            raise FileNotFoundError(find_files[index] + " does not exist. Please provide the correct data path with the -d option.")
    if find_files:
        return data_path, find_files
    else:
        return data_path

# Simple helper data class that's a little nicer to use than a 2-tuple.
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding))
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
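For reference, a minimal sketch of how these helpers are strung together; it assumes a serialized engine such as the /tmp/mnist-v1.3.engine file that the notebooks later in this dump build with trtexec:

import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates and cleans up the CUDA context
import tensorrt as trt

import common

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

with open("/tmp/mnist-v1.3.engine", "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

inputs, outputs, bindings, stream = common.allocate_buffers(engine)
with engine.create_execution_context() as context:
    # Fill the page-locked input buffer, then run one async pass.
    np.copyto(inputs[0].host, np.random.random_sample(inputs[0].host.shape))
    [result] = common.do_inference(context, bindings, inputs, outputs, stream)
    print(result.shape)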
""" # make X at least 2d y = np.atleast_2d(X) # find axis if axis is None: axis = next(j[0] for j in enumerate(y.shape) if j[1] > 1) # multiply y against the theta parameter, y = y * float(theta) # subtract the max for numerical stability y = y - np.expand_dims(np.max(y, axis = axis), axis) # exponentiate y y = np.exp(y) # take the sum along the specified axis ax_sum = np.expand_dims(np.sum(y, axis = axis), axis) # finally: divide elementwise p = y / ax_sum # flatten if X was 1D if len(X.shape) == 1: p = p.flatten() return p class ModelData(object): MODEL_PATH = "/work/models/flowers-152.onnx" INPUT_SHAPE = (3, 224, 224) # We can convert TensorRT data types to numpy types with trt.nptype() DTYPE = trt.float32 # You can set the logger severity higher to suppress messages (or lower to display more messages). TRT_LOGGER = trt.Logger(trt.Logger.WARNING) # Allocate host and device buffers, and create a stream. def allocate_buffers(engine): # Determine dimensions and create page-locked memory buffers (i.e. won't be swapped to disk) to hold host inputs/outputs. h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=trt.nptype(ModelData.DTYPE)) h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=trt.nptype(ModelData.DTYPE)) # Allocate device memory for inputs and outputs. d_input = cuda.mem_alloc(h_input.nbytes) d_output = cuda.mem_alloc(h_output.nbytes) # Create a stream in which to copy inputs/outputs and run inference. stream = cuda.Stream() return h_input, d_input, h_output, d_output, stream def do_inference(context, h_input, d_input, h_output, d_output, stream): # Transfer input data to the GPU. cuda.memcpy_htod_async(d_input, h_input, stream) # Run inference. context.execute_async(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle) # Transfer predictions back from the GPU. cuda.memcpy_dtoh_async(h_output, d_output, stream) # Synchronize the stream stream.synchronize() # The Onnx path is used for Onnx models. def build_engine_onnx(model_file, calibrator=None): with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, trt.OnnxParser(network, TRT_LOGGER) as parser: builder.max_workspace_size = common.GiB(1) builder.max_batch_size = 8 precision = "fp32" if calibrator: builder.int8_mode = True builder.int8_calibrator = calibrator precision = "int8" else: builder.fp16_mode = True precision = "fp16" # Load the Onnx model and parse it in order to populate the TensorRT network. with open(model_file, 'rb') as model: parser.parse(model.read()) engine = builder.build_cuda_engine(network) serialized = engine.serialize() with open("/work/models/flowers-152-b{}-{}.engine".format(builder.max_batch_size, precision), "wb") as file: file.write(serialized) return engine def normalize_image(image_name): image = Image.open(image_name) # Resize, antialias and transpose the image to CHW. c, h, w = ModelData.INPUT_SHAPE image_arr = np.asarray(image.resize((w, h), Image.ANTIALIAS)).transpose([2, 0, 1]).astype(trt.nptype(ModelData.DTYPE)).ravel() # This particular ResNet50 model requires some preprocessing, specifically, mean normalization. return ((image_arr / 255.0) - 0.5) * 2.0 def load_normalized_test_case(test_image, pagelocked_buffer): # Normalize the image and copy to pagelocked memory. 
class ModelData(object):
    MODEL_PATH = "/work/models/flowers-152.onnx"
    INPUT_SHAPE = (3, 224, 224)
    # We can convert TensorRT data types to numpy types with trt.nptype()
    DTYPE = trt.float32

# You can set the logger severity higher to suppress messages (or lower to display more messages).
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

# Allocate host and device buffers, and create a stream.
def allocate_buffers(engine):
    # Determine dimensions and create page-locked memory buffers (i.e. won't be swapped to disk) to hold host inputs/outputs.
    h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), dtype=trt.nptype(ModelData.DTYPE))
    h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), dtype=trt.nptype(ModelData.DTYPE))
    # Allocate device memory for inputs and outputs.
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)
    # Create a stream in which to copy inputs/outputs and run inference.
    stream = cuda.Stream()
    return h_input, d_input, h_output, d_output, stream

def do_inference(context, h_input, d_input, h_output, d_output, stream):
    # Transfer input data to the GPU.
    cuda.memcpy_htod_async(d_input, h_input, stream)
    # Run inference.
    context.execute_async(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    cuda.memcpy_dtoh_async(h_output, d_output, stream)
    # Synchronize the stream
    stream.synchronize()

# The Onnx path is used for Onnx models.
def build_engine_onnx(model_file, calibrator=None):
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network() as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_workspace_size = common.GiB(1)
        builder.max_batch_size = 8
        precision = "fp32"
        if calibrator:
            builder.int8_mode = True
            builder.int8_calibrator = calibrator
            precision = "int8"
        else:
            builder.fp16_mode = True
            precision = "fp16"
        # Load the Onnx model and parse it in order to populate the TensorRT network.
        with open(model_file, 'rb') as model:
            parser.parse(model.read())
        engine = builder.build_cuda_engine(network)
        serialized = engine.serialize()
        with open("/work/models/flowers-152-b{}-{}.engine".format(builder.max_batch_size, precision), "wb") as file:
            file.write(serialized)
        return engine

def normalize_image(image_name):
    image = Image.open(image_name)
    # Resize, antialias and transpose the image to CHW.
    c, h, w = ModelData.INPUT_SHAPE
    image_arr = np.asarray(image.resize((w, h), Image.ANTIALIAS)).transpose([2, 0, 1]).astype(trt.nptype(ModelData.DTYPE)).ravel()
    # This particular ResNet50 model requires some preprocessing, specifically, mean normalization.
    return ((image_arr / 255.0) - 0.5) * 2.0

def load_normalized_test_case(test_image, pagelocked_buffer):
    # Normalize the image and copy to pagelocked memory.
    np.copyto(pagelocked_buffer, normalize_image(test_image))
    return test_image

def create_calibration_dataset():
    jpegs = []
    for dirpath, subdirs, files in os.walk("/work/models/flowers-data/flowers"):
        for f in files:
            if f.endswith("jpg"):
                jpegs.append(os.path.join(dirpath, f))
    random.shuffle(jpegs)
    return jpegs[:200]

class ImageBatchStream:
    def __init__(self, batch_size, calibration_files):
        c, h, w = ModelData.INPUT_SHAPE
        self.batch_size = batch_size
        self.files = calibration_files
        self.batch = 0
        self.max_batches = (len(calibration_files) // batch_size) + \
                           (1 if (len(calibration_files) % batch_size) else 0)
        self.calibration_data = np.zeros((batch_size, c, h, w), dtype=np.float32)

    def reset(self):
        self.batch = 0

    def next_batch(self):
        c, h, w = ModelData.INPUT_SHAPE
        if self.batch < self.max_batches:
            imgs = []
            files_for_batch = self.files[self.batch_size * self.batch : \
                                         self.batch_size * (self.batch + 1)]
            for f in files_for_batch:
                print("[ImageBatchStream] Processing ", f)
                img = normalize_image(f)
                imgs.append(img.reshape((c, h, w)))
            for i in range(len(imgs)):
                self.calibration_data[i] = imgs[i]
            self.batch += 1
            return np.ascontiguousarray(self.calibration_data, dtype=np.float32)
        else:
            return np.array([])

class MyEntropyCalibrator(trt.IInt8EntropyCalibrator):
    def __init__(self, stream):
        trt.IInt8EntropyCalibrator.__init__(self)
        self.batchstream = stream
        self.d_input = cuda.mem_alloc(self.batchstream.calibration_data.nbytes)
        stream.reset()

    def get_batch_size(self):
        return self.batchstream.batch_size

    def get_batch(self, bindings, names):
        batch = self.batchstream.next_batch()
        if not batch.size:
            return None
        cuda.memcpy_htod(self.d_input, batch)
        bindings[0] = int(self.d_input)
        return bindings

    def read_calibration_cache(self, length):
        return None

    def write_calibration_cache(self, ptr, size):
        # cache = ctypes.c_char_p(int(ptr))
        # with open('calibration_cache.bin', 'wb') as f:
        #     f.write(cache.value)
        return None
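# Sketch of the calibration handshake: during build_cuda_engine() with
# int8_mode set, TensorRT drives the calibrator itself -- it calls
# get_batch_size() once and then get_batch() repeatedly until it returns
# None, running each batch through the network to collect activation
# histograms. Roughly:
#
#   stream = ImageBatchStream(8, create_calibration_dataset())
#   calib = MyEntropyCalibrator(stream)
#   while calib.get_batch(bindings=[0], names=["data"]) is not None:
#       pass  # 200 files at batch size 8 -> 25 calibration batches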
def main():
    calibration_files = create_calibration_dataset()
    batch_stream = ImageBatchStream(8, calibration_files)
    int8_calibrator = None
    int8_calibrator = MyEntropyCalibrator(batch_stream)
    engine = build_engine_onnx("/work/models/flowers-152.onnx", calibrator=int8_calibrator)
    # serialized = engine.serialize()
    # with open("/work/models/flowers-152-b8-int8.engine", "wb") as file:
    #     file.write(serialized)
    # h_input, d_input, h_output, d_output, stream = allocate_buffers(engine)
    # with engine.create_execution_context() as context:
    #     for test_image in ["/work/models/flowers-data/test/image_07927.jpg",
    #                        "/work/models/flowers-data/test/image_06969.jpg",]:
    #         #test_image = "/work/models/flowers-data/test/image_07927.jpg" # 13 - blanket flower
    #         #test_image = "/work/models/flowers-data/test/image_06969.jpg" # 0 - alpine sea holly
    #         test_case = load_normalized_test_case(test_image, h_input)
    #         do_inference(context, h_input, d_input, h_output, d_output, stream)
    #         # We use the highest probability as our prediction. Its index corresponds to the predicted label.
    #         pred = np.argmax(h_output)
    #         score = softmax(h_output)[pred]
    #         print("Recognized " + test_case + " as " + str(pred) + " score: " + str(score))

def old_main():
    # Set the data path to the directory that contains the trained models and test images for inference.
    data_path, data_files = common.find_sample_data(
        description="Runs a ResNet50 network with a TensorRT inference engine.",
        subfolder="resnet50",
        find_files=["binoculars.jpeg", "reflex_camera.jpeg", "tabby_tiger_cat.jpg", ModelData.MODEL_PATH, "class_labels.txt"])
    # Get test images, models and labels.
    test_images = data_files[0:3]
    onnx_model_file, labels_file = data_files[3:]
    labels = open(labels_file, 'r').read().split('\n')
    # Build a TensorRT engine.
    with build_engine_onnx(onnx_model_file) as engine:
        # Inference is the same regardless of which parser is used to build the engine, since the model architecture is the same.
        # Allocate buffers and create a CUDA stream.
        h_input, d_input, h_output, d_output, stream = allocate_buffers(engine)
        # Contexts are used to perform inference.
        with engine.create_execution_context() as context:
            # Load a normalized test case into the host input page-locked buffer.
            test_image = random.choice(test_images)
            test_case = load_normalized_test_case(test_image, h_input)
            # Run the engine. The output will be a 1D tensor of length 1000, where each value represents the
            # probability that the image corresponds to that label
            do_inference(context, h_input, d_input, h_output, d_output, stream)
            # We use the highest probability as our prediction. Its index corresponds to the predicted label.
            pred = np.argmax(h_output)
            print("Recognized " + test_case + " as " + labels[pred])

if __name__ == '__main__':
    main()

================================================
FILE: models/setup.py
================================================
#!/usr/bin/env python3
#
# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
import os
import subprocess

models = [
    ("ResNet-50-deploy.prototxt", "prob"),
    # ("ResNet-152-deploy.prototxt", "prob"),
]

precisions = [
    # ("fp32", ""),
    ("fp16", "--fp16"),
    # ("int8", "--int8")
]

def main():
    for model, o in models:
        for name, p in precisions:
            for b in [1, 8]:  #, 2, 4, 8]:
                n = "b{}-{}".format(b, name)
                e = model.replace("prototxt", "engine")
                e = e.replace("deploy", n)
                # Resolve the engine path against the models dir (not the cwd)
                # so the notebooks find it at /work/models/.
                e = os.path.join("/work/models", e)
                m = os.path.join("/work/models", model)
                if os.path.isfile(e):
                    continue
                subprocess.call("trtexec --deploy={} --batch={} --output={} {} --engine={}".format(
                    m, b, o, p, e), shell=True)

if __name__ == "__main__":
    main()
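On its first pass the loop above shells out to `trtexec --deploy=/work/models/ResNet-50-deploy.prototxt --batch=1 --output=prob --fp16 --engine=/work/models/ResNet-50-b1-fp16.engine`, producing one of the engine files the notebooks below register. A minimal sketch for sanity-checking such an engine, assuming it was built as above:

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

with open("/work/models/ResNet-50-b1-fp16.engine", "rb") as f, \
        trt.Runtime(TRT_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

assert engine.max_batch_size == 1
for i in range(engine.num_bindings):
    # e.g. "data (3, 224, 224)" and "prob (1000, 1, 1)"
    print(engine.get_binding_name(i), engine.get_binding_shape(i))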
================================================
FILE: notebooks/Demo Day 1.ipynb
================================================
{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "import time\n", "import numpy as np\n", "import wurlitzer\n", "\n", "import trtlab\n", "import infer_test_utils as utils\n", "\n", "# this allows us to capture stdout and stderr from the backend c++ infer-runtime\n", "display_output = wurlitzer.sys_pipes" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "inputs = utils.load_inputs(\"/work/models/onnx/mnist-v1.3/test_data_set_0\")\n", "expected = utils.load_outputs(\"/work/models/onnx/mnist-v1.3/test_data_set_0\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!trtexec --onnx=/work/models/onnx/mnist-v1.3/model.onnx --saveEngine=/tmp/mnist-v1.3.engine" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Local Inference Setup" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with display_output():\n", " manager = trtlab.InferenceManager(max_exec_concurrency=2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with display_output():\n", " manager.register_tensorrt_engine(\"mnist\", \"/tmp/mnist-v1.3.engine\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with display_output():\n", " manager.update_resources()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Local Inference Properties" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "mnist = manager.infer_runner(\"mnist\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "mnist.input_bindings()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "mnist.output_bindings()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Local Inference Compute" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "futures = [mnist.infer(Input3=input) for input in inputs]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# free to do other work while inference is being computed" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "results = [f.get() for f in futures]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for r, e in zip(results, expected):\n", " for key, val in r.items():\n", " r = val.reshape((1,10))\n", " np.testing.assert_almost_equal(r, e, decimal=3)\n", " print(\"Test Passed\")\n", " print(\"Result: {}\".format(np.argmax(utils.softmax(r))))\n", "\n",
"utils.mnist_image(inputs[0]).show()\n", "expected[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: notebooks/Demo Day 2.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "import time\n", "import numpy as np\n", "import wurlitzer\n", "\n", "import trtlab\n", "import infer_test_utils as utils\n", "\n", "# this allows us to capture stdout and stderr from the backend c++ infer-runtime\n", "display_output = wurlitzer.sys_pipes" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "inputs = utils.load_inputs(\"/work/models/onnx/mnist-v1.3/test_data_set_0\")\n", "expected = utils.load_outputs(\"/work/models/onnx/mnist-v1.3/test_data_set_0\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Remote Inference Setup" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with display_output():\n", " manager = trtlab.RemoteInferenceManager(hostname=\"localhost:50052\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with display_output():\n", " models = manager.get_models()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with display_output():\n", " print(models)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Remote Inference Properties" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "mnist = manager.infer_runner(\"mnist\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "mnist.input_bindings()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "mnist.output_bindings()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Remote Inference Compute" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "futures = [mnist.infer(Input3=input) for input in inputs]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# free to do other work while inference is being computed" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "results = [f.get() for f in futures]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for r, e in zip(results, expected):\n", " for key, val in r.items():\n", " r = val.reshape((1,10))\n", " np.testing.assert_almost_equal(r, e, decimal=3)\n", " print(\"Test Passed\")\n", " print(\"Output Binding Name: {}; shape: {}\".format(key, val.shape))\n", " print(\"Result: {}\".format(np.argmax(utils.softmax(r))))\n", " \n", "utils.mnist_image(inputs[0]).show()\n", "expected[0]" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, 
"file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: notebooks/Demo Day 3.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "import time\n", "import numpy as np\n", "import wurlitzer\n", "\n", "import trtlab\n", "import infer_test_utils as utils\n", "\n", "# this allows us to capture stdout and stderr from the backend c++ infer-runtime\n", "display_output = wurlitzer.sys_pipes" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!trtexec --onnx=/work/models/onnx/mnist-v1.3/model.onnx --saveEngine=/tmp/mnist-v1.3.engine" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Remote Inference Setup" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with display_output():\n", " manager = trtlab.InferenceManager(max_exec_concurrency=2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with display_output():\n", " manager.register_tensorrt_engine(\"mnist\", \"/tmp/mnist-v1.3.engine\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with display_output():\n", " manager.update_resources()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Remote Inference Server" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with display_output():\n", " manager.serve()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: notebooks/Multiple Models.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "import time\n", "import numpy as np\n", "import wurlitzer\n", "\n", "import trtlab\n", "\n", "# this allows us to capture stdout and stderr from the backend c++ infer-runtime\n", "display_output = wurlitzer.sys_pipes" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "!/work/models/setup.py" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Local Inference Setup" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "WARNING: Logging before InitGoogleLogging() is written to STDERR\n", "I0204 22:01:27.543411 925 inference_manager.cc:64] -- Initialzing TensorRT Resource Manager --\n", "I0204 22:01:27.543426 925 inference_manager.cc:65] Maximum Execution Concurrency: 4\n", "I0204 22:01:27.543429 925 inference_manager.cc:66] Maximum Copy Concurrency: 8\n" ] } ], "source": [ "with display_output():\n", " manager = trtlab.InferenceManager(max_exec_concurrency=4)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "I0204 22:01:30.164453 925 
model.cc:91] Binding: data; isInput: true; dtype size: 4; bytes per batch item: 602112\n", "I0204 22:01:30.164479 925 model.cc:91] Binding: prob; isInput: false; dtype size: 4; bytes per batch item: 4000\n", "I0204 22:01:30.169529 925 inference_manager.cc:149] -- Registering Model: rn50-b1 --\n", "I0204 22:01:30.169546 925 inference_manager.cc:150] Input/Output Tensors require 591.9 KiB\n", "I0204 22:01:30.169550 925 inference_manager.cc:151] Execution Activations require 5.7 MiB\n", "I0204 22:01:30.169554 925 inference_manager.cc:155] Weights require 75.8 MiB\n", "I0204 22:01:30.223752 925 model.cc:91] Binding: data; isInput: true; dtype size: 4; bytes per batch item: 602112\n", "I0204 22:01:30.223776 925 model.cc:91] Binding: prob; isInput: false; dtype size: 4; bytes per batch item: 4000\n", "I0204 22:01:30.227011 925 inference_manager.cc:149] -- Registering Model: rn50-b8 --\n", "I0204 22:01:30.227035 925 inference_manager.cc:150] Input/Output Tensors require 4.6 MiB\n", "I0204 22:01:30.227041 925 inference_manager.cc:151] Execution Activations require 39.8 MiB\n", "I0204 22:01:30.227046 925 inference_manager.cc:155] Weights require 49.0 MiB\n" ] } ], "source": [ "with display_output():\n", " manager.register_tensorrt_engine(\"rn50-b1\", \"/work/models/ResNet-50-b1-fp16.engine\")\n", " manager.register_tensorrt_engine(\"rn50-b8\", \"/work/models/ResNet-50-b8-fp16.engine\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "I0204 22:01:31.025523 925 inference_manager.cc:194] -- Allocating TensorRT Resources --\n", "I0204 22:01:31.025539 925 inference_manager.cc:195] Creating 4 TensorRT execution tokens.\n", "I0204 22:01:31.025542 925 inference_manager.cc:196] Creating a Pool of 8 Host/Device Memory Stacks\n", "I0204 22:01:31.025550 925 inference_manager.cc:197] Each Host Stack contains 4.7 MiB\n", "I0204 22:01:31.025554 925 inference_manager.cc:198] Each Device Stack contains 4.8 MiB\n", "I0204 22:01:31.025559 925 inference_manager.cc:199] Total GPU Memory: 197.5 MiB\n" ] } ], "source": [ "with display_output():\n", " manager.update_resources()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Local Inference Properties" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "b1 = manager.infer_runner(\"rn50-b1\")\n", "b8 = manager.infer_runner(\"rn50-b8\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'data': {'dtype': dtype('float32'), 'shape': [3, 224, 224]}}" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "b1.input_bindings()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "b1.max_batch_size()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'data': {'dtype': dtype('float32'), 'shape': [3, 224, 224]}}" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "b8.input_bindings()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "8" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "b8.max_batch_size()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ 
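"# helper defined for this notebook (not part of trtlab): the full input shape\n", "# at max batch size is [max_batch_size, *binding_shape] for the named input binding\n",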
"def max_batch_size_shape(x, input='data'):\n", " batch = [x.max_batch_size()]\n", " batch.extend(x.input_bindings()[input]['shape'])\n", " return batch" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[8, 3, 224, 224]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "max_batch_size_shape(b8)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Compute\n", "\n", "Here we launch two async inferences with two different TensorRT engines, one built for batch1, the other for batch8. While these are the same ResNet-50 models, they could be any two unique TensorRT engines.\n", "\n", "Note: for this example the weights of the model and the input tensors are all random values." ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "futures = [model.infer(data=np.random.random_sample(max_batch_size_shape(model))) for model in [b1, b8]]" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "# free to do other work while inference is being computed" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "results = [f.get() for f in futures]" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "prob binding has shape: (1, 1000, 1, 1)\n", "prob binding has shape: (8, 1000, 1, 1)\n" ] } ], "source": [ "for result in results:\n", " for output, tensor in result.items():\n", " print(\"{} binding has shape: {}\".format(output, tensor.shape))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: notebooks/Quickstart.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# TensorRT Runtime\n", "\n", "This example walks through the basic usecase of:\n", " 1. initialization the infer-runtime\n", " 2. loading a model\n", " 3. allocating resources\n", " 4. inspecting the input/output bindings of the model\n", " 5. evaluating the model using async futures\n", " 6. testing for correctness" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "import time\n", "import numpy as np\n", "import wurlitzer\n", "\n", "import trtlab\n", "import infer_test_utils as utils\n", "\n", "# this allows us to capture stdout and stderr from the backend c++ infer-runtime\n", "display_output = wurlitzer.sys_pipes" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!trtexec --onnx=/work/models/onnx/mnist-v1.3/model.onnx --saveEngine=/work/models/onnx/mnist-v1.3/mnist-v1.3.engine" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. Initialize infer-runtime\n", "\n", "The most important option when initializing the infer-runtime is to set the maximum number of conncurrent executions that can be executed at any given time. This value is tunable for your application. 
Lower settings reduce latency; higher settings increase throughput. Evaluate how your model performs using ...TODO-this-notebook..." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with display_output():\n", " models = trtlab.InferenceManager(max_executions=2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. Register a Model\n", "\n", "To register a model, simply associate a `model_name` with a path to a TensorRT engine file. The returned object is an `InferRunner` object. Use an `InferRunner` to submit work to the backend inference queue." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with display_output():\n", " mnist = models.register_tensorrt_engine(\"mnist\", \"/work/models/onnx/mnist-v1.3/mnist-v1.3.engine\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. Allocate Resources\n", "\n", "Before you can submit inference requests, you need to allocate some internal resources. This should be done any time new models are registered. There may be a runtime performance interruption if you update the resources while the queue is full." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with display_output():\n", " models.update_resources()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4. Inspect Model\n", "\n", "Query the `InferRunner` to see what it expects for inputs and what it will return for outputs." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "mnist.input_bindings()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "mnist.output_bindings()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5. Submit Infer Requests\n", "\n", "`InferRunner.infer` accepts a dict of numpy arrays that match the input description, submits the inference request to the backend compute engine, and returns a future to a dict of numpy arrays. \n", "\n", "This means the method returns almost immediately; however, that does not mean the inference is complete. Use `get()` to wait for the result. This is a blocking call." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "result = mnist.infer(Input3=np.random.random_sample([1,28,28]))\n", "result # result is a future" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "result = result.get()\n", "result # result is the value of the future - dict of np arrays" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "with display_output():\n", " start = time.process_time()\n", " result = mnist.infer(**{k: np.random.random_sample(v['shape']) for k,v in mnist.input_bindings().items()})\n", " print(\"Queue Time: {}\".format(time.process_time() - start))\n", " result = result.get()\n", " print(\"Compute Time: {}\".format(time.process_time() - start))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6. Test for Correctness\n", "\n", "Load test image and results. [Thanks to the ONNX Model Zoo](https://github.com/onnx/models/tree/master/mnist) for this example."
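, "\n", "Before running the test cells below, here is the whole flow from this notebook condensed into a single standalone script; this is a sketch that reuses only the calls demonstrated above (same engine path and helper module):\n", "\n", "```python\n", "import numpy as np\n", "import trtlab\n", "import infer_test_utils as utils\n", "\n", "models = trtlab.InferenceManager(max_executions=2)\n", "mnist = models.register_tensorrt_engine(\"mnist\", \"/work/models/onnx/mnist-v1.3/mnist-v1.3.engine\")\n", "models.update_resources()\n", "\n", "inputs = utils.load_inputs(\"/work/models/onnx/mnist-v1.3/test_data_set_0\")\n", "expected = utils.load_outputs(\"/work/models/onnx/mnist-v1.3/test_data_set_0\")\n", "\n", "# queue all requests first; each call returns a future immediately\n", "futures = [mnist.infer(Input3=x) for x in inputs]\n", "# then block on each future to collect the results\n", "results = [f.get() for f in futures]\n", "\n", "for r, e in zip(results, expected):\n", "    for name, val in r.items():\n", "        np.testing.assert_almost_equal(val.reshape(1, 10), e, decimal=3)\n", "```"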
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "inputs = utils.load_inputs(\"/work/models/onnx/mnist-v1.3/test_data_set_0\")\n", "expected = utils.load_outputs(\"/work/models/onnx/mnist-v1.3/test_data_set_0\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "utils.mnist_image(inputs[0]).show()\n", "expected[0]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "Submit the images to the inference queue, then wait for each result to be returned." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "results = [mnist.infer(Input3=input) for input in inputs]\n", "results = [r.get() for r in results]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check results.\n", "TODO - update the utils to return dictionaries instead of arrays" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for r, e in zip(results, expected):\n", " for key, val in r.items():\n", " r = val.reshape((1,10))\n", " np.testing.assert_almost_equal(r, e, decimal=3)\n", " print(\"Test Passed\")\n", " print(\"Output Binding Name: {}; shape: {}\".format(key, val.shape))\n", " print(\"Result: {}\".format(np.argmax(utils.softmax(r))))\n", " # r # show the raw tensor" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: notebooks/README.md ================================================ # Jupyter Notebooks Note: Many of the ONNX examples will fail until TensorRT 5.1 is available in the container. 
This includes: - Demo Day 1 - Demo Day 2 - Demo Day 3 ================================================ FILE: requirements.txt ================================================ appdirs==1.4.3 atomicwrites==1.2.1 attrs==18.2.0 backcall==0.1.0 bleach>=3.1.1 boto3==1.9.109 botocore==1.12.109 click==6.7 cmake==3.11.0 cycler==0.10.0 decorator==4.3.0 defusedxml==0.5.0 docutils==0.14 entrypoints==0.3 grpcio==1.16.1 ipykernel==5.1.0 ipython==7.3.0 ipython-genutils==0.2.0 jedi==0.13.3 Jinja2==2.10.1 jmespath==0.9.4 jsonschema==3.0.1 jupyter-client==5.2.4 jupyter-core==4.4.0 jupyterlab==0.35.4 jupyterlab-server==0.2.0 kiwisolver==1.0.1 Mako==1.0.7 MarkupSafe==1.0 matplotlib==3.0.2 mistune==0.8.4 more-itertools==4.3.0 mxnet==1.4.0.post0 nbconvert==5.4.1 nbformat==4.4.0 notebook==5.7.8 numpy==1.15.4 onnx==1.3.0 pandocfilters==1.4.2 parso==0.5.0 pathlib2==2.3.3 pexpect==4.6.0 pickleshare==0.7.5 Pillow>=6.2.2 pluggy==0.8.0 prometheus-client==0.6.0 prompt-toolkit==2.0.9 protobuf==3.7.0 ptyprocess==0.6.0 py==1.7.0 pycuda==2018.1.1 Pygments==2.3.1 pyparsing==2.3.1 pyrsistent==0.14.11 pytest==4.0.2 python-dateutil==2.8.0 pytools==2018.5.2 pyzmq==18.0.0 s3transfer==0.2.0 Send2Trash==1.5.0 six==1.12.0 terminado==0.8.1 testpath==0.4.2 tornado==5.1 traitlets==4.3.2 typing==3.6.6 typing-extensions==3.7.2 urllib3==1.24.2 wcwidth==0.1.7 webencodings==0.5.1 wurlitzer==1.0.2 ================================================ FILE: trtlab/BUILD.bazel ================================================ exports_files([ "core", "cuda", "nvrpc", "tensorrt", ], visibility = ["//visibility:public"], ) ================================================ FILE: trtlab/CMakeLists.txt ================================================ #cmake_minimum_required(VERSION 3.9 FATAL_ERROR) #project(trtlab) #include(GNUInstallDirs) #option(ENABLE_TESTING "Build tests" ON) #set(CMAKE_CXX_STANDARD 17) #set(CMAKE_CXX_EXTENSIONS ON) #set(CMAKE_DEBUG_POSTFIX "-d") set(CMAKE_POSITION_INDEPENDENT_CODE TRUE) #set(default_build_type "Release") #set(CMAKE_THREAD_PREFER_PTHREAD TRUE) #find_package(Threads) #if(ENABLE_TESTING) # message(STATUS "Building Tests") # find_package(GTest) #1.8.1 CONFIG REQUIRED) # find_package(benchmark) # enable_testing() #endif() # suppress warnings #add_compile_options( # $<$,CXX>,$>:-Wno-deprecated-declarations> #) if(BUILD_MEMORY) add_subdirectory(memory) endif() if(BUILD_CORE) add_subdirectory(core) endif() if(BUILD_CUDA) add_subdirectory(cuda) endif() if(BUILD_NVRPC) add_subdirectory(nvrpc) endif() if(BUILD_TENSORRT) add_subdirectory(tensorrt) endif() if(BUILD_PYTHON) add_subdirectory(pybind) endif() # install # include(CMakePackageConfigHelpers) # # install( # EXPORT ${PROJECT_NAME}-targets # NAMESPACE ${PROJECT_NAME}:: # FILE ${PROJECT_NAME}-targets.cmake # DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}" # ) # # configure_package_config_file( # "${CMAKE_CURRENT_SOURCE_DIR}/cmake/${PROJECT_NAME}-config.cmake.in" # ${PROJECT_NAME}-config.cmake # INSTALL_DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}" # NO_CHECK_REQUIRED_COMPONENTS_MACRO # PATH_VARS CMAKE_INSTALL_INCLUDEDIR # ) # # install( # # FILES "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config.cmake" # DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}" # ) ================================================ FILE: trtlab/core/BUILD.bazel ================================================ cc_library( name = "core", srcs = glob([ "src/**/*.cc", "src/**/*.h", ]), hdrs = glob( ["include/**/*.h"], ), deps = [ "@com_google_glog//:glog", ], 
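# headers live under include/trtlab/core/...; stripping the leading "include"
# lets dependents write #include "trtlab/core/<header>.h"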
strip_include_prefix = "include", visibility = ["//visibility:public"], ) ================================================ FILE: trtlab/core/CMakeLists.txt ================================================ include(GNUInstallDirs) set(CMAKE_THREAD_PREFER_PTHREAD TRUE) find_package(Threads) find_package(cpuaff) find_package(glog 0.3.5 REQUIRED) find_package(dlpack) find_package(Boost REQUIRED COMPONENTS fiber_numa fiber context) set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS ON) # when linking against static glog one also needs to link against gflags; # which version of gflags is required depends on the build type find_package(gflags REQUIRED) if( CMAKE_BUILD_TYPE STREQUAL "Debug" ) find_package(gflags COMPONENTS nothreads_shared) set(trtlab_gflags "gflags_nothreads_shared") message(STATUS "gflags: ${trtlab_gflags}") message(STATUS "build_type: ${CMAKE_BUILD_TYPE}") else() find_package(gflags COMPONENTS nothreads_static) set(trtlab_gflags "gflags_nothreads_static") message(STATUS "gflags: ${trtlab_gflags}") message(STATUS "build_type: ${CMAKE_BUILD_TYPE}") endif() set(header_path ${CMAKE_CURRENT_SOURCE_DIR}/include/trtlab/core) add_library(core # src/types.cc src/affinity.cc src/utils.cc src/cyclic_buffer.cc src/cyclic_windowed_buffer.cc ) add_library(${PROJECT_NAME}::core ALIAS core) message(STATUS "jemalloc: ${JEMALLOC_STATIC_LIBRARIES}") target_link_libraries(core PUBLIC trtlab_memory dlpack::dlpack Threads::Threads $<$,$>>:rt> glog::glog ${trtlab_gflags} Boost::fiber Boost::fiber_numa Boost::context ${JEMALLOC_STATIC_LIBRARIES} dl ) target_include_directories(core PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include> PRIVATE ${header_path} ) set_target_properties(core PROPERTIES OUTPUT_NAME ${PROJECT_NAME}_core) install( TARGETS core EXPORT ${PROJECT_NAME}-targets RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} ) install( DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} ) #if(ENABLE_TESTING) add_subdirectory(tests) #endif() #if(benchmark_FOUND) add_subdirectory(benchmarks) #endif() ================================================ FILE: trtlab/core/benchmarks/CMakeLists.txt ================================================ # Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. find_package(benchmark) add_executable(bench_core main.cc bench_pool.cc bench_thread_pool.cc bench_batcher.cc # bench_memory.cc # bench_memory_stack.cc ) target_link_libraries(bench_core PRIVATE ${PROJECT_NAME}::core benchmark::benchmark ) add_test(NAME bench_core COMMAND $<TARGET_FILE:bench_core>) ================================================ FILE: trtlab/core/benchmarks/bench_batcher.cc ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include <chrono> #include <cstdint> #include <functional> #include <future> #include <memory> #include <queue> #include <benchmark/benchmark.h> #include "trtlab/core/batcher.h" #include "trtlab/core/dispatcher.h" #include "trtlab/core/task_pool.h" #include "trtlab/core/thread_pool.h" using namespace trtlab; static void batcher_standard_batcher_int(benchmark::State& state) { StandardBatcher<std::size_t> batcher(state.range(0)); std::size_t counter = 0; for (auto _ : state) { auto future = batcher.enqueue(++counter); auto batch = batcher.update(); if (batch) { batch->promise.set_value(); future.wait(); } } state.SetItemsProcessed(static_cast<std::int64_t>(state.iterations())); } struct audio_state { const std::uint16_t* data; std::size_t size; std::shared_ptr<void> state; }; static void batcher_standard_batcher_audio(benchmark::State& state) { StandardBatcher<audio_state> batcher(state.range(0)); std::size_t counter = 0; for (auto _ : state) { auto future = batcher.enqueue({nullptr, 0ul, nullptr}); auto batch = batcher.update(); if (batch) { batch->promise.set_value(); future.wait(); } } state.SetItemsProcessed(static_cast<std::int64_t>(state.iterations())); } static void batcher_engine(benchmark::State& state) { const std::size_t batch_size = state.range(0); auto execute_on_batch = [](const std::vector<std::size_t>& batch, std::function<void()> free_inputs) { free_inputs(); }; auto thread_pool = std::make_shared<ThreadPool>(1); auto task_pool = std::make_shared<TaskPool>(); StandardBatcher<std::size_t> batcher(batch_size); Dispatcher dispatcher(std::move(batcher), std::chrono::milliseconds(15), thread_pool, task_pool, execute_on_batch); std::queue<std::shared_future<void>> f; int pre_load = 3; for (int i = 0; i < pre_load; i++) { f.push(dispatcher.enqueue(0)); } for (int i = 0; i < (batch_size - 1) * pre_load; i++) { dispatcher.enqueue(i); } for (auto _ : state) { f.push(dispatcher.enqueue(0)); for (int i = 0; i < batch_size - 1; i++) { dispatcher.enqueue(i); } f.front().wait(); f.pop(); } while (!f.empty()) { f.front().wait(); f.pop(); } state.SetItemsProcessed(static_cast<std::int64_t>(state.iterations()) * state.range(0)); } BENCHMARK(batcher_standard_batcher_int)->RangeMultiplier(2)->Range(1, 1 << 7); BENCHMARK(batcher_standard_batcher_audio)->RangeMultiplier(2)->Range(1 << 6, 1 << 7); BENCHMARK(batcher_engine)->RangeMultiplier(2)->Range(4, 1 << 6)->UseRealTime()->MinTime(3.0); ================================================ FILE: trtlab/core/benchmarks/bench_memory.cc ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <memory> #include <benchmark/benchmark.h> #include "trtlab/core/memory/allocator.h" #include "trtlab/core/memory/malloc.h" #include "trtlab/core/memory/sysv_allocator.h" using namespace trtlab; static void BM_Memory_SystemMalloc(benchmark::State& state) { for(auto _ : state) { //auto unique = std::make_unique<Allocator<Malloc>>(1024 * 1024); //auto shared = std::make_shared<Allocator<Malloc>>(1024 * 1024); Allocator<Malloc> memory1(1024 * 1024); Allocator<Malloc> memory2(1024 * 1024); Allocator<Malloc> memory3(1024 * 1024); } } static void BM_Memory_SystemV_descriptor(benchmark::State& state) { auto master = std::make_unique<Allocator<SystemV>>(1024 * 1024); for(auto _ : state) { auto mdesc = SystemV::Attach(master->ShmID()); } } /* static void BM_Memory_HostDescriptor(benchmark::State& state) { void *ptr = (void*)0xDEADBEEF; mem_size_t size = 1337; for(auto _ : state) { nextgen::HostDescriptor hdesc(ptr, size, []{}); } } */ /* static void BM_Memory_SharedHostDescriptor(benchmark::State& state) { void *ptr = (void*)0xDEADBEEF; mem_size_t size = 1337; for(auto _ : state) { nextgen::Descriptor hdesc(ptr, size, []{}); auto shared = std::make_shared>(std::move(hdesc)); } } */ /* static void BM_Memory_NextGenMalloc(benchmark::State& state) { static mem_size_t one_mb = 1024*1024; for(auto _ : state) { auto hdesc0 = nextgen::Malloc::Allocate(one_mb); auto hdesc1 = nextgen::Malloc::Allocate(one_mb); } } */ BENCHMARK(BM_Memory_SystemMalloc); BENCHMARK(BM_Memory_SystemV_descriptor); // BENCHMARK(BM_Memory_HostDescriptor); // BENCHMARK(BM_Memory_SharedHostDescriptor); // BENCHMARK(BM_Memory_NextGenMalloc); ================================================ FILE: trtlab/core/benchmarks/bench_memory_stack.cc ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <vector> #include <benchmark/benchmark.h> #include "trtlab/core/hybrid_mutex.h" #include "trtlab/core/memory.h" using namespace trtlab; static void allocators_transactional_raw(benchmark::State& state) { using namespace memory::literals; auto alloc = memory::make_allocator_adapter(memory::malloc_allocator()); auto block_alloc = memory::trtlab::make_growth_capped_block_allocator(1_MiB, 4u, std::move(alloc)); auto trans_alloc = memory::trtlab::make_transactional_allocator(std::move(block_alloc)); trans_alloc.reserve_blocks(4u); for(auto _ : state) { for(int i=0; i < state.range(0); i++) { auto ptr = trans_alloc.allocate_node(1024, 64); trans_alloc.deallocate_node(ptr, 0u, 0u); } } } static void allocators_transactional_std(benchmark::State& state) { using namespace memory::literals; auto alloc = memory::make_allocator_adapter(memory::malloc_allocator()); auto block_alloc = memory::trtlab::make_growth_capped_block_allocator(1_MiB, 4u, std::move(alloc)); auto trans_alloc = memory::trtlab::make_transactional_allocator(std::move(block_alloc)); trans_alloc.reserve_blocks(4u); auto smart = memory::trtlab::make_allocator(std::move(trans_alloc)); for(auto _ : state) { for(int i=0; i < state.range(0); i++) { auto ptr = smart.allocate_node(1024, 64); smart.deallocate_node(ptr, 0u, 0u); } } } static void allocators_transactional_md(benchmark::State& state) { using namespace memory::literals; auto alloc = memory::make_allocator_adapter(memory::malloc_allocator()); auto block_alloc = memory::trtlab::make_growth_capped_block_allocator(1_MiB, 4u, std::move(alloc)); auto trans_alloc = memory::trtlab::make_transactional_allocator(std::move(block_alloc)); trans_alloc.reserve_blocks(4u); auto smart = memory::trtlab::make_allocator(std::move(trans_alloc)); for(auto _ : state) { for(int i=0; i < state.range(0); i++) { auto md = smart.allocate_descriptor(1024, 64); } } } #if 0 template using custom_vector = std::vector>; template auto make_vector(RawAllocator& alloc) { using std_allocator = memory::std_allocator; return std::vector(std_allocator(alloc)); } static void BM_vector_transactional(benchmark::State& state) { using namespace memory::literals; auto malloc = memory::make_allocator_reference(memory::MallocAllocator()); auto block_alloc = memory::trtlab::make_growth_capped_block_allocator(128_MiB, 8, std::move(malloc)); auto trans_alloc = memory::trtlab::make_transactional_allocator(std::move(block_alloc)); trans_alloc.reserve_blocks(8); for(auto _ : state) { auto vec = make_vector(trans_alloc); vec.reserve(1024*1024*8); } } static void BM_vector_smart_transactional(benchmark::State& state) { using namespace memory::literals; auto malloc = memory::make_allocator_reference(memory::MallocAllocator()); auto block_alloc = memory::trtlab::make_growth_capped_block_allocator(128_MiB, 8, std::move(malloc)); auto trans_alloc = memory::trtlab::make_transactional_allocator(std::move(block_alloc)); trans_alloc.reserve_blocks(8); auto smart = memory::trtlab::make_allocator(std::move(trans_alloc));
for(auto _ : state) { auto vec = memory::trtlab::make_vector(smart); vec.reserve(1024*1024*8); } } static void BM_CyclicAllocator_stl_allocator(benchmark::State& state) { { auto v0 = custom_vector(1024); } for(auto _ : state) { custom_vector vector; vector.reserve(1024*1024*8); } } static void BM_CyclicAllocator_stl_allocator2(benchmark::State& state) { size_t ctr = 1024; for(auto _ : state) { custom_vector v3; v3.reserve(ctr*ctr*8); } } static void BM_vector_default(benchmark::State& state) { size_t ctr = 1024; for(auto _ : state) { std::vector vec; vec.reserve(1024*1024*8); } } static void BM_stl_allocator_ctor(benchmark::State& state) { for(auto _ : state) { auto a = stl::temporary_allocator(); } } static void BM_stl_allocator_allocate_lifecycle(benchmark::State& state) { for(auto _ : state) { auto a = stl::temporary_allocator(); auto i = a.allocate(1024); a.deallocate(i, 1024); } } #endif BENCHMARK(allocators_transactional_raw)->RangeMultiplier(2)->Range(1, 1 << 2); BENCHMARK(allocators_transactional_std)->RangeMultiplier(2)->Range(1, 1 << 0); BENCHMARK(allocators_transactional_md)->RangeMultiplier(2)->Range(1, 1 << 0); ================================================ FILE: trtlab/core/benchmarks/bench_pool.cc ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "trtlab/core/pool.h" #include "trtlab/core/userspace_threads.h" #include static void BM_Pool_v1_Pop(benchmark::State& state) { using trtlab::v1::Pool; struct Object { }; auto pool = Pool::Create(); pool->EmplacePush(new Object); for(auto _ : state) { auto obj = pool->Pop(); } } static void BM_Pool_v2_Pop(benchmark::State& state) { using trtlab::v2::Pool; struct Object { }; auto pool = Pool::Create(); pool->EmplacePush(); for(auto _ : state) { auto obj = pool->Pop(); } } static void BM_Pool_v3_Pop(benchmark::State& state) { using trtlab::v3::Pool; struct Object { }; auto pool = Pool::Create(); pool->emplace_push(); for(auto _ : state) { auto obj = std::move(pool->pop()); } } static void BM_Pool_v4_Pop(benchmark::State& state) { using trtlab::v4::Pool; struct Object { }; auto pool = Pool::Create(); pool->EmplacePush(); for(auto _ : state) { auto obj = std::move(pool->pop_unique()); } } static void BM_Pool_v4_Pop_Shared(benchmark::State& state) { using trtlab::v4::Pool; struct Object { }; auto pool = Pool::Create(); pool->emplace_push(); for(auto _ : state) { auto obj = std::move(pool->pop_shared()); } } static void BM_Pool_v3_Pop_Userspace(benchmark::State& state) { using trtlab::v3::Pool; struct Object { }; auto pool = Pool::Create(); pool->emplace_push(); for(auto _ : state) { auto obj = std::move(pool->pop()); } } BENCHMARK(BM_Pool_v1_Pop); BENCHMARK(BM_Pool_v2_Pop); BENCHMARK(BM_Pool_v3_Pop); BENCHMARK(BM_Pool_v4_Pop); BENCHMARK(BM_Pool_v4_Pop_Shared); BENCHMARK(BM_Pool_v3_Pop_Userspace); ================================================ FILE: trtlab/core/benchmarks/bench_thread_pool.cc ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "trtlab/core/hybrid_condition.h" #include "trtlab/core/hybrid_mutex.h" #include "trtlab/core/thread_pool.h" #include static void BM_ThreadPool_Enqueue(benchmark::State& state) { using trtlab::ThreadPool; auto pool = std::make_unique(1); for(auto _ : state) { CHECK(pool); // enqueue only auto future = pool->enqueue([] {}); //future.get(); } } BENCHMARK(BM_ThreadPool_Enqueue)->UseRealTime(); static void BM_HybridThreadPool_Enqueue(benchmark::State& state) { using trtlab::BaseThreadPool; auto pool = std::make_unique>(1); for(auto _ : state) { auto future = pool->enqueue([] {}); } } BENCHMARK(BM_HybridThreadPool_Enqueue)->UseRealTime(); ================================================ FILE: trtlab/core/benchmarks/main.cc ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include BENCHMARK_MAIN(); ================================================ FILE: trtlab/core/include/trtlab/core/affinity.h ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #pragma once #include #include "cpuaff/cpuaff.hpp" namespace trtlab { struct affinity; class cpu_set final : public cpuaff::cpu_set { public: using cpuaff::cpu_set::cpu_set; cpu_set get_intersection(const cpu_set& other) const; cpu_set get_union(const cpu_set& other) const; cpu_set get_difference(const cpu_set& other) const; static cpu_set from_string(std::string); std::string cpus_string() const; std::string cores_string() const; std::string sockets_string() const; auto get_allocator() const -> cpuaff::round_robin_allocator { return cpuaff::round_robin_allocator(*this); }; friend std::ostream& operator<<(std::ostream& s, const cpu_set& cpus); }; std::ostream& operator<<(std::ostream& s, const cpu_set& cpus); class affinity_guard final { // hold the original affinity of the calling thread // the original affinity will be restored on destruction cpu_set m_original_cpus; public: affinity_guard(); explicit affinity_guard(const cpu_set&); ~affinity_guard(); affinity_guard(const affinity_guard&) = delete; affinity_guard& operator=(const affinity_guard&) = delete; affinity_guard(affinity_guard&&) noexcept = delete; affinity_guard& operator=(affinity_guard&&) noexcept = delete; }; struct numa_node { unsigned id; cpu_set cpus; std::vector distances; friend std::ostream& operator<<(std::ostream& s, const numa_node& cpus); }; std::ostream& operator<<(std::ostream& s, const numa_node& cpus); struct affinity final { struct this_thread final { static cpu_set get_affinity(); static void set_affinity(const cpu_set&); }; struct system final { // static cpu_set cpus_by_numa(int numa_id); // static cpu_set cpus_by_socket(int socket_id); // static cpu_set cpus_by_core(int core_id); // static cpu_set cpus_by_hyperthread(int thread_id); static cpuaff::cpu cpu_from_logical_id(int id); static std::vector topology(); }; }; } // namespace trtlab ================================================ FILE: trtlab/core/include/trtlab/core/async_compute.h ================================================ /* Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of NVIDIA CORPORATION nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #pragma once #include #include namespace trtlab { namespace async { namespace detail { template