[
  {
    "path": ".clang-format",
    "content": "---\nBasedOnStyle: Google\n\nIndentWidth: 2\nColumnLimit: 80\nContinuationIndentWidth: 4\nUseTab: Never\nMaxEmptyLinesToKeep: 2\n\nSortIncludes: true\nCompactNamespaces: true\nReflowComments: true\n\nDerivePointerAlignment: false\nPointerAlignment: Left\n\nAllowShortIfStatementsOnASingleLine: false\nAllowShortBlocksOnASingleLine: false\nAllowShortFunctionsOnASingleLine: Inline\n\nAlwaysBreakAfterReturnType: TopLevelDefinitions\nAlignAfterOpenBracket: AlwaysBreak\nBreakBeforeBraces: Custom\nBraceWrapping:\n  AfterClass: false\n  AfterControlStatement: false\n  AfterEnum: false\n  AfterFunction: true\n  AfterNamespace: false\n  AfterStruct: false\n  AfterUnion: false\n  BeforeCatch: true\n\nBinPackArguments: true\nBinPackParameters: true\nConstructorInitializerAllOnOneLineOrOnePerLine: false\n\nIndentCaseLabels: true\n"
  },
  {
    "path": ".github/workflows/pre-commit.yml",
    "content": "# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\nname: pre-commit\n\non:\n  pull_request:\n\njobs:\n  pre-commit:\n    runs-on: ubuntu-latest\n    steps:\n    - uses: actions/checkout@v5.0.0\n    - uses: actions/setup-python@v6.0.0\n    - uses: pre-commit/action@v3.0.1\n"
  },
  {
    "path": ".gitignore",
    "content": "/build\n/.vscode\n*.so\n"
  },
  {
    "path": ".pre-commit-config.yaml",
    "content": "# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\nrepos:\n- repo: https://github.com/PyCQA/isort\n  rev: 5.12.0\n  hooks:\n  - id: isort\n    additional_dependencies: [toml]\n- repo: https://github.com/psf/black\n  rev: 23.1.0\n  hooks:\n  - id: black\n    types_or: [python, cython]\n- repo: https://github.com/PyCQA/flake8\n  rev: 7.3.0\n  hooks:\n  - id: flake8\n    args: [--max-line-length=88, --select=C,E,F,W,B,B950, --extend-ignore = E203,E501]\n    types_or: [python, cython]\n- repo: https://github.com/pre-commit/mirrors-clang-format\n  rev: v16.0.5\n  hooks:\n  - id: clang-format\n    types_or: [c, c++, cuda, proto, textproto, java]\n    args: [\"-fallback-style=none\", \"-style=file\", \"-i\"]\n- repo: https://github.com/codespell-project/codespell\n  rev: v2.2.4\n  hooks:\n  - id: codespell\n    additional_dependencies: [tomli]\n    args: [\"--toml\", \"pyproject.toml\"]\n    exclude: (?x)^(.*stemmer.*|.*stop_words.*|^CHANGELOG.md$)\n# More details about these pre-commit hooks here:\n# https://pre-commit.com/hooks.html\n- repo: https://github.com/pre-commit/pre-commit-hooks\n  rev: v6.0.0\n  hooks:\n  - id: check-case-conflict\n  - id: check-executables-have-shebangs\n  - id: check-merge-conflict\n  - id: check-json\n  - id: check-toml\n  - id: check-yaml\n  - id: check-shebang-scripts-are-executable\n  - id: end-of-file-fixer\n    types_or: [c, c++, cuda, proto, textproto, java, python]\n  - id: mixed-line-ending\n  - id: requirements-txt-fixer\n  - id: trailing-whitespace\n"
  },
  {
    "path": "CMakeLists.txt",
    "content": "# Copyright 2020-2025, NVIDIA CORPORATION. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\ncmake_minimum_required(VERSION 3.31.8)\n\nproject(tritonbackend LANGUAGES C CXX)\n\n#\n# Options\n#\noption(TRITON_ENABLE_GPU \"Enable GPU support in backend utilities\" ON)\noption(TRITON_ENABLE_MALI_GPU \"Enable Arm MALI GPU support in backend utilities\" OFF)\noption(TRITON_ENABLE_STATS \"Include statistics collections in backend utilities\" ON)\n# Default OFF unless backend explicitly request to use provided implementation\noption(TRITON_ENABLE_MEMORY_TRACKER \"Include device memory tracker in backend utilities\" OFF)\n\nset(TRITON_REPO_ORGANIZATION \"https://github.com/triton-inference-server\" CACHE STRING \"Git repository to pull from\")\nset(TRITON_COMMON_REPO_TAG \"main\" CACHE STRING \"Tag for triton-inference-server/common repo\")\nset(TRITON_CORE_REPO_TAG \"main\" CACHE STRING \"Tag for triton-inference-server/core repo\")\n\n#\n# Setting C++ standard\n#\nset(TRITON_MIN_CXX_STANDARD 17 CACHE STRING \"The minimum C++ standard whose features are requested to build this target.\")\n\nif(NOT CMAKE_BUILD_TYPE)\n  set(CMAKE_BUILD_TYPE Release)\nendif()\n\nif(TRITON_ENABLE_MEMORY_TRACKER AND NOT TRITON_ENABLE_GPU)\n  message(WARNING \"TRITON_ENABLE_MEMORY_TRACKER=ON requires TRITON_ENABLE_GPU=ON, TRITON_ENABLE_MEMORY_TRACKER will be disable\")\n  set(TRITON_ENABLE_MEMORY_TRACKER OFF CACHE BOOL \"Device memory tracker disabled\" FORCE)\nendif()\n\n#\n# Dependencies\n#\ninclude(FetchContent)\n\nFetchContent_Declare(\n  repo-common\n  
GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/common.git\n  GIT_TAG ${TRITON_COMMON_REPO_TAG}\n  GIT_SHALLOW ON\n)\nFetchContent_Declare(\n  repo-core\n  GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/core.git\n  GIT_TAG ${TRITON_CORE_REPO_TAG}\n  GIT_SHALLOW ON\n)\nFetchContent_MakeAvailable(repo-common repo-core)\n\n#\n# CUDA\n#\nif(${TRITON_ENABLE_GPU})\n  find_package(CUDAToolkit REQUIRED)\n  set(CMAKE_CUDA_RUNTIME_LIBRARY Shared)\n  message(STATUS \"Using CUDA ${CUDAToolkit_VERSION}\")\n\n  if(CUDAToolkit_VERSION VERSION_GREATER \"10.1\" OR CUDAToolkit_VERSION VERSION_EQUAL \"10.1\")\n    add_definitions(-DTRITON_ENABLE_CUDA_GRAPH=1)\n  else()\n    message(WARNING \"CUDA ${CUDA_VERSION} does not support CUDA graphs.\")\n  endif()\nendif() # TRITON_ENABLE_GPU\n\n#\n# Backend library containing useful source and utilities\n#\nset(SRC_FILES\n  \"src/backend_common.cc\"\n  \"src/backend_input_collector.cc\"\n  \"src/backend_memory.cc\"\n  \"src/backend_model_instance.cc\"\n  \"src/backend_model.cc\"\n  \"src/backend_output_responder.cc\"\n)\n\nif(${TRITON_ENABLE_GPU})\n  set(SRC_FILES ${SRC_FILES} \"src/kernel.h\")\n  if(${TRITON_ENABLE_MEMORY_TRACKER})\n    set(SRC_FILES ${SRC_FILES} \"src/device_memory_tracker.cc\")\n  endif() # TRITON_ENABLE_MEMORY_TRACKER\nendif() # TRITON_ENABLE_GPU\n\nadd_library(\n  triton-backend-utils\n  ${SRC_FILES}\n)\n\nif(${TRITON_ENABLE_GPU})\n  add_library(\n    kernel_library_new\n    src/kernel.cu src/kernel.h\n  )\n\n  enable_language(CUDA)\n  include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/define.cuda_architectures.cmake)\n  set_target_properties(kernel_library_new PROPERTIES LANGUAGE CUDA)\n  set_target_properties(kernel_library_new PROPERTIES CUDA_ARCHITECTURES \"${CUDA_ARCHITECTURES}\")\n  set_target_properties(kernel_library_new PROPERTIES POSITION_INDEPENDENT_CODE ON)\n  set_target_properties(kernel_library_new PROPERTIES LINKER_LANGUAGE CUDA)\n  target_compile_features(kernel_library_new PUBLIC cxx_std_${TRITON_MIN_CXX_STANDARD})\n  
set_target_properties(kernel_library_new PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)\n\nendif() # TRITON_ENABLE_GPU\n\nadd_library(\n  TritonBackend::triton-backend-utils ALIAS triton-backend-utils\n)\n\ntarget_include_directories(\n  triton-backend-utils\n  PUBLIC\n    $<INSTALL_INTERFACE:include>\n    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>\n  PRIVATE\n    ${CMAKE_CURRENT_SOURCE_DIR}/src\n)\n\nif(CMAKE_CXX_COMPILER_ID STREQUAL \"MSVC\")\n  message(\"Using MSVC as compiler, default target on Windows 10. \"\n      \"If the target system is not Windows 10, please update _WIN32_WINNT \"\n      \"to corresponding value.\")\nendif()\n\ntarget_compile_features(triton-backend-utils PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD})\ntarget_compile_options(\n  triton-backend-utils\n  PRIVATE\n  $<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:\n    -Wall -Wextra -Wno-unused-parameter>\n  $<$<CXX_COMPILER_ID:MSVC>:/Wall /D_WIN32_WINNT=0x0A00 /EHsc /Zc:preprocessor>\n)\n\n# TRITON_ENABLE_GPU exposed in header so set PUBLIC\nif(${TRITON_ENABLE_GPU})\n  target_compile_definitions(\n    triton-backend-utils\n    PUBLIC TRITON_ENABLE_GPU=1\n  )\n  if(${TRITON_ENABLE_MEMORY_TRACKER})\n    target_compile_definitions(\n      triton-backend-utils\n      PUBLIC TRITON_ENABLE_MEMORY_TRACKER=1\n    )\n  endif() # TRITON_ENABLE_MEMORY_TRACKER\nendif() # TRITON_ENABLE_GPU\n\n# TRITON_ENABLE_MALI_GPU exposed in header so set PUBLIC\nif(${TRITON_ENABLE_MALI_GPU})\ntarget_compile_definitions(\n  triton-backend-utils\n  PUBLIC TRITON_ENABLE_MALI_GPU=1\n)\nendif() # TRITON_ENABLE_MALI_GPU\n\n# TRITON_ENABLE_STATS exposed in header so set PUBLIC\nif(${TRITON_ENABLE_STATS})\ntarget_compile_definitions(\n  triton-backend-utils\n  PUBLIC TRITON_ENABLE_STATS=1\n)\nendif() # TRITON_ENABLE_STATS\n\nset_target_properties(\n  triton-backend-utils PROPERTIES\n  WINDOWS_EXPORT_ALL_SYMBOLS TRUE\n  POSITION_INDEPENDENT_CODE ON\n  OUTPUT_NAME 
tritonbackendutils\n)\n\ntarget_link_libraries(\n  triton-backend-utils\n  PUBLIC\n    triton-core-backendapi         # from repo-core\n    triton-core-serverapi          # from repo-core\n    triton-common-async-work-queue # from repo-common\n    triton-common-json             # from repo-common\n)\n\nif(${TRITON_ENABLE_GPU})\n  target_link_libraries(\n    triton-backend-utils\n    PUBLIC\n      CUDA::cudart\n    PRIVATE\n      kernel_library_new\n  )\n  if(${TRITON_ENABLE_MEMORY_TRACKER})\n    target_link_libraries(\n      triton-backend-utils\n      PUBLIC\n        CUDA::cupti\n    )\n  endif() # TRITON_ENABLE_MEMORY_TRACKER\nendif() # TRITON_ENABLE_GPU\n\n#\n# Install\n#\ninclude(GNUInstallDirs)\nset(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TritonBackend)\n\ninstall(\n  TARGETS\n    triton-backend-utils\n  EXPORT\n    triton-backend-targets\n  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}\n  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}\n)\n\nif(${TRITON_ENABLE_GPU})\n  install(\n    TARGETS\n      kernel_library_new\n    EXPORT\n      triton-backend-targets\n    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}\n    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}\n  )\nendif() # TRITON_ENABLE_GPU\n\ninstall(\n  DIRECTORY include/\n  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}\n)\n\ninstall(\n  EXPORT\n    triton-backend-targets\n  FILE\n    TritonBackendTargets.cmake\n  NAMESPACE\n    TritonBackend::\n  DESTINATION\n    ${INSTALL_CONFIGDIR}\n)\n\ninclude(CMakePackageConfigHelpers)\nconfigure_package_config_file(\n  ${CMAKE_CURRENT_LIST_DIR}/cmake/TritonBackendConfig.cmake.in\n  ${CMAKE_CURRENT_BINARY_DIR}/TritonBackendConfig.cmake\n  INSTALL_DESTINATION ${INSTALL_CONFIGDIR}\n)\n\ninstall(\n  FILES\n  ${CMAKE_CURRENT_BINARY_DIR}/TritonBackendConfig.cmake\n  DESTINATION ${INSTALL_CONFIGDIR}\n)\n\n#\n# Export from build tree\n#\nexport(\n  EXPORT triton-backend-targets\n  FILE ${CMAKE_CURRENT_BINARY_DIR}/TritonBackendTargets.cmake\n  NAMESPACE 
TritonBackend::\n)\n\nexport(PACKAGE TritonBackend)\n"
  },
  {
    "path": "LICENSE",
    "content": "Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.\n\nRedistribution and use in source and binary forms, with or without\nmodification, are permitted provided that the following conditions\nare met:\n * Redistributions of source code must retain the above copyright\n   notice, this list of conditions and the following disclaimer.\n * Redistributions in binary form must reproduce the above copyright\n   notice, this list of conditions and the following disclaimer in the\n   documentation and/or other materials provided with the distribution.\n * Neither the name of NVIDIA CORPORATION nor the names of its\n   contributors may be used to endorse or promote products derived\n   from this software without specific prior written permission.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\nEXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\nIMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\nPURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR\nCONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\nEXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\nPROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\nPROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\nOF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\nOF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n"
  },
  {
    "path": "README.md",
    "content": "<!--\n# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n-->\n\n[![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause)\n\n# Triton Inference Server Backend\n\nA Triton *backend* is the implementation that executes a model. A\nbackend can be a wrapper around a deep-learning framework, like\nPyTorch, TensorFlow, TensorRT or ONNX Runtime. 
Or a backend can be\ncustom C/C++ logic performing any operation (for example, image\npre-processing).\n\nThis repo contains documentation on Triton backends and also source,\nscripts and utilities for creating Triton backends. You do not need to\nuse anything provided in this repo to create a Triton backend but you\nwill likely find its contents useful.\n\n## Frequently Asked Questions\n\nFull documentation is included below but these shortcuts can help you\nget started in the right direction.\n\n### Where can I ask general questions about Triton and Triton backends?\n\nBe sure to read all the information below as well as the [general\nTriton\ndocumentation](https://github.com/triton-inference-server/server#triton-inference-server)\navailable in the main\n[server](https://github.com/triton-inference-server/server) repo. If\nyou don't find your answer there you can ask questions on the main\nTriton [issues\npage](https://github.com/triton-inference-server/server/issues).\n\n### Where can I find all the backends that are available for Triton?\n\nAnyone can develop a Triton backend, so it isn't possible for us to\nknow about all available backends. But the Triton project does provide\na set of supported backends that are tested and updated with each\nTriton release.\n\n**TensorRT**: The TensorRT backend is used to execute TensorRT\nmodels. The\n[tensorrt_backend](https://github.com/triton-inference-server/tensorrt_backend)\nrepo contains the source for the backend.\n\n**ONNX Runtime**: The ONNX Runtime backend is used to execute ONNX\nmodels. The\n[onnxruntime_backend](https://github.com/triton-inference-server/onnxruntime_backend)\nrepo contains the documentation and source for the backend.\n\n**TensorFlow**: The TensorFlow backend is used to execute TensorFlow\nmodels in both GraphDef and SavedModel formats. The same backend is\nused to execute both TensorFlow 1 and TensorFlow 2 models. 
The\n[tensorflow_backend](https://github.com/triton-inference-server/tensorflow_backend)\nrepo contains the documentation and source for the backend.\n\n**PyTorch**: The PyTorch backend is used to execute PyTorch models in both\nTorchScript and PyTorch 2.0 formats. The\n[pytorch_backend](https://github.com/triton-inference-server/pytorch_backend)\nrepo contains the documentation and source for the backend.\n\n**OpenVINO**: The OpenVINO backend is used to execute\n[OpenVINO](https://docs.openvinotoolkit.org/latest/index.html)\nmodels. The\n[openvino_backend](https://github.com/triton-inference-server/openvino_backend)\nrepo contains the documentation and source for the backend.\n\n**Python**: The Python backend allows you to write your model logic in\nPython. For example, you can use this backend to execute pre/post\nprocessing code written in Python, or to execute a PyTorch Python\nscript directly (instead of first converting it to TorchScript and\nthen using the PyTorch backend). The\n[python_backend](https://github.com/triton-inference-server/python_backend)\nrepo contains the documentation and source for the backend.\n\n**DALI**: [DALI](https://github.com/NVIDIA/DALI) is a collection of\nhighly optimized building blocks and an execution engine that\naccelerates the pre-processing of the input data for deep learning\napplications. The DALI backend allows you to execute your DALI\npipeline within Triton. The\n[dali_backend](https://github.com/triton-inference-server/dali_backend)\nrepo contains the documentation and source for the backend.\n\n**FIL**: The FIL ([Forest Inference\nLibrary](https://github.com/rapidsai/cuml/tree/branch-21.10/python/cuml/fil))\nbackend is used to execute a variety of tree-based ML models, including\nXGBoost models, LightGBM models, Scikit-Learn random forest models, and cuML\nrandom forest models. 
The\n[fil_backend](https://github.com/triton-inference-server/fil_backend) repo\ncontains the documentation and source for the backend.\n\n**TensorRT-LLM**: The TensorRT-LLM backend allows you to serve\n[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) models with Triton Server.\nCheck out the\n[Triton TRT-LLM user guide](https://github.com/triton-inference-server/server/blob/main/docs/getting_started/trtllm_user_guide.md)\nfor more information. The\n[tensorrtllm_backend](https://github.com/triton-inference-server/tensorrtllm_backend)\nrepo contains the documentation and source for the backend.\n\n**vLLM**: The vLLM backend is designed to run\n[supported models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)\non a [vLLM engine](https://github.com/vllm-project/vllm/blob/main/vllm/engine/async_llm_engine.py).\nThis backend depends on [python_backend](https://github.com/triton-inference-server/python_backend)\nto load and serve models. The\n[vllm_backend](https://github.com/triton-inference-server/vllm_backend) repo\ncontains the documentation and source for the backend.\n\n**Important Note!** Not all the above backends are supported on every platform\nsupported by Triton. Look at the\n[Backend-Platform Support Matrix](docs/backend_platform_support_matrix.md)\nto learn about the same.\n\n### How can I develop my own Triton backend?\n\nFirst you probably want to ask on the main Triton [issues\npage](https://github.com/triton-inference-server/server/issues) to\nmake sure you are not duplicating a backend that already exists. Then\nfollow the [tutorial](examples/README.md) to learn how to create your\nfirst simple Triton backend and incrementally improve it to add more\nfeatures. You should also read the complete documentation on [Triton\nbackends](#backends).\n\n### Can I add (or remove) a backend to an existing Triton installation?\n\nYes. 
See [Backend Shared Library](#backend-shared-library) for general\ninformation about how the shared library implementing a backend is\nmanaged by Triton, and [Triton with Unsupported and Custom\nBackends](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/compose.md#triton-with-unsupported-and-custom-backends)\nfor documentation on how to add your backend to the released Triton\nDocker image. For a standard install the globally available backends\nare in /opt/tritonserver/backends.\n\n### What about backends developed using the \"legacy custom backend\" API.\n\nThe legacy custom API is removed from Triton. If you have custom\nbackends that you developed using this older API you must port them to\nthe new [Triton Backend API](#triton-backend-api).\n\n## Backends\n\nA Triton *backend* is the implementation that executes a model. A\nbackend can be a wrapper around a deep-learning framework, like\nPyTorch, TensorFlow, TensorRT, ONNX Runtime or OpenVINO. A backend can\nalso implement any functionality you want as long as it adheres to the\n[backend API](#triton-backend-api). Triton uses this API to send\nrequests to the backend for execution and the backend uses the API to\ncommunicate with Triton.\n\nEvery model must be associated with a backend. 
A model's backend is\nspecified in the model's configuration using the `backend` setting.\nFor using TensorRT backend, the value of this setting should be `tensorrt`.\nSimilarly, for using PyTorch, ONNX and TensorFlow backends, the `backend`\nfield should be set to `pytorch`, `onnxruntime` or `tensorflow` respectively.\nFor all other backends, `backend` must be set to the name of the backend.\nSome backends may also check the `platform` setting for categorizing the model,\nfor example, in TensorFlow backend, `platform` should be set to\n`tensorflow_savedmodel` or `tensorflow_graphdef` according to the model format.\nPlease refer to the specific backend repository on whether `platform` is used.\n\n### Backend Shared Library\n\nEach backend must be implemented as a shared library and the name of\nthe shared library must be *libtriton_\\<backend-name\\>.so*. For\nexample, if the name of the backend is \"mybackend\", a model indicates\nthat it uses the backend by setting the model configuration 'backend'\nsetting to \"mybackend\", and Triton looks for *libtriton_mybackend.so*\nas the shared library that implements the backend. The\n[tutorial](examples/README.md) shows examples of how to build your\nbackend logic into the appropriate shared library.\n\nFor a model, *M* that specifies backend *B*, Triton searches for the\nbackend shared library in the following places, in this order:\n\n* \\<model_repository\\>/M/\\<version_directory\\>/libtriton_B.so\n\n* \\<model_repository\\>/M/libtriton_B.so\n\n* \\<global_backend_directory\\>/B/libtriton_B.so\n\nWhere \\<global_backend_directory\\> is by default\n/opt/tritonserver/backends.  The --backend-directory flag can be used\nto override the default.\n\nTypically you will install your backend into the global backend\ndirectory. 
For example, if using Triton Docker images you can follow\nthe instructions in [Triton with Unsupported and Custom\nBackends](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/compose.md#triton-with-unsupported-and-custom-backends). Continuing\nthe example of a backend names \"mybackend\", you would install into the\nTriton image as:\n\n```\n/opt/\n  tritonserver/\n    backends/\n      mybackend/\n        libtriton_mybackend.so\n        ... # other files needed by mybackend\n```\n\nStarting from 24.01, the default backend shared library name can be changed by\nproviding the `runtime` setting in the model configuration. For example,\n\n```\nruntime: \"my_backend_shared_library_name.so\"\n```\n\nA model may choose a specific runtime implementation provided by the backend.\n\n### Triton Backend API\n\nA Triton backend must implement the C interface defined in\n[tritonbackend.h](https://github.com/triton-inference-server/core/tree/main/include/triton/core/tritonbackend.h). The\nfollowing abstractions are used by the API.\n\n#### TRITONBACKEND_Backend\n\nA TRITONBACKEND_Backend object represents the backend itself. The\nsame backend object is shared across all models that use the\nbackend. The associated API, like TRITONBACKEND_BackendName, is used\nto get information about the backend and to associate a user-defined\nstate with the backend.\n\nA backend can optionally implement TRITONBACKEND_Initialize and\nTRITONBACKEND_Finalize to get notification of when the backend object\nis created and destroyed (for more information see [backend\nlifecycles](#backend-lifecycles)).\n\n#### TRITONBACKEND_Model\n\nA TRITONBACKEND_Model object represents a model. Each model loaded by\nTriton is associated with a TRITONBACKEND_Model. Each model can use\nthe TRITONBACKEND_ModelBackend API to get the backend object\nrepresenting the backend that is used by the model.\n\nThe same model object is shared across all instances of that\nmodel. 
The associated API, like TRITONBACKEND_ModelName, is used to\nget information about the model and to associate a user-defined state\nwith the model.\n\nMost backends will implement TRITONBACKEND_ModelInitialize and\nTRITONBACKEND_ModelFinalize to initialize the backend for a given\nmodel and to manage the user-defined state associated with the model\n(for more information see [backend lifecycles](#backend-lifecycles)).\n\nThe backend must take into account threading concerns when\nimplementing TRITONBACKEND_ModelInitialize and\nTRITONBACKEND_ModelFinalize.  Triton will not perform multiple\nsimultaneous calls to these functions for a given model; however, if a\nbackend is used by multiple models Triton may simultaneously call the\nfunctions with a different thread for each model. As a result, the\nbackend must be able to handle multiple simultaneous calls to the\nfunctions. Best practice for backend implementations is to use only\nfunction-local and model-specific user-defined state in these\nfunctions, as is shown in the [tutorial](examples/README.md).\n\n#### TRITONBACKEND_ModelInstance\n\nA TRITONBACKEND_ModelInstance object represents a model\n*instance*. Triton creates one or more instances of the model based on\nthe *instance_group* settings specified in the model\nconfiguration. Each of these instances is associated with a\nTRITONBACKEND_ModelInstance object.\n\nThe only function that the backend must implement is\nTRITONBACKEND_ModelInstanceExecute. The\nTRITONBACKEND_ModelInstanceExecute function is called by Triton to\nperform inference/computation on a batch of inference requests. Most\nbackends will also implement TRITONBACKEND_ModelInstanceInitialize\nand TRITONBACKEND_ModelInstanceFinalize to initialize the backend for\na given model instance and to manage the user-defined state associated\nwith the model (for more information see [backend\nlifecycles](#backend-lifecycles)).\n\nA backend can optionally implement TRITONBACKEND_ModelInstanceReady. 
This\nfunction is called by the Triton server's ready endpoint to check whether\na model instance is ready to handle requests. The function returns\n`nullptr` (indicating success) if the instance is ready, or a\n`TRITONSERVER_Error` if the instance is not ready.\n\nThe backend must take into account threading concerns when\nimplementing TRITONBACKEND_ModelInstanceInitialize,\nTRITONBACKEND_ModelInstanceFinalize and\nTRITONBACKEND_ModelInstanceExecute.  Triton will not perform multiple\nsimultaneous calls to these functions for a given model instance;\nhowever, if a backend is used by a model with multiple instances or by\nmultiple models Triton may simultaneously call the functions with a\ndifferent thread for each model instance. As a result, the backend\nmust be able to handle multiple simultaneous calls to the\nfunctions. Best practice for backend implementations is to use only\nfunction-local and model-specific user-defined state in these\nfunctions, as is shown in the [tutorial](examples/README.md).\n\n#### TRITONBACKEND_Request\n\nA TRITONBACKEND_Request object represents an inference request made\nto the model. The backend takes ownership of the request object(s) in\nTRITONBACKEND_ModelInstanceExecute and must release each request by\ncalling TRITONBACKEND_RequestRelease. However, the ownership of request\nobject is returned back to Triton in case TRITONBACKEND_ModelInstanceExecute\nreturns an error. See [Inference Requests and Responses](#inference-requests-and-responses)\nfor more information about request lifecycle.\n\nThe Triton Backend API allows the backend to get information about the\nrequest as well as the input and request output tensors of the\nrequest. Each request input is represented by a TRITONBACKEND_Input\nobject.\n\n#### TRITONBACKEND_Response\n\nA TRITONBACKEND_Response object represents a response sent by the\nbackend for a specific request. 
The backend uses the response API to\nset the name, shape, datatype and tensor values for each output tensor\nincluded in the response. The response can indicate either a failed or\na successful request. See [Inference Requests and\nResponses](#inference-requests-and-responses) for more information\nabout request-response lifecycle.\n\n#### TRITONBACKEND_BackendAttribute\n\nA `TRITONBACKEND_BackendAttribute` allows a backend to set certain attributes which\nare queried by Triton to inform certain feature support, preferred configurations, and\nother types of backend-specific behavior.\n\nWhen initializing a backend, Triton will query the `TRITONBACKEND_GetBackendAttribute` function\nif implemented by the backend. This function is optional to implement, but is generally used to call\nthe related `TRITONBACKEND_BackendAttribute` APIs for setting backend-specific attributes.\n\nSome of the relevant BackendAttribute setter APIs are listed below:\n- `TRITONBACKEND_BackendSetExecutionPolicy`\n- `TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup`\n    - Defines a priority list of instance groups to prefer for this backend if a model config doesn't explicitly define any instance groups.\n- `TRITONBACKEND_BackendAttributeSetParallelModelInstanceLoading`\n    - Defines whether the backend can safely handle concurrent calls to `TRITONBACKEND_ModelInstanceInitialize` or not.\n    - Loading model instances in parallel can improve server startup times for large instance counts.\n    - By default, this attribute is set to false, meaning that parallel instance loading is disabled for all backends unless explicitly enabled.\n    - The following official backends currently support loading model instances in parallel:\n        - Python\n        - ONNXRuntime\n\nThe full list of `TRITONBACKEND_BackendAttribute` related APIs are defined in\n[tritonbackend.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonbackend.h).\n\n### Backend 
Lifecycles\n\nA backend must carefully manage the lifecycle of the backend itself,\nthe models and model instances that use the backend and the inference\nrequests that execute on the model instances using the backend.\n\n#### Backend and Model\n\nBackend, model and model instance initialization is triggered when\nTriton loads a model.\n\n* If the model requires a backend that is not already in use by an\n  already loaded model, then:\n\n  * Triton [loads the shared library](#backend-shared-library) that\n    implements the backend required by the model.\n\n  * Triton creates the TRITONBACKEND_Backend object that represents\n    the backend.\n\n  * Triton calls TRITONBACKEND_Initialize if it is implemented in the\n    backend shared library. TRITONBACKEND_Initialize should not return\n    until the backend is completely initialized. If\n    TRITONBACKEND_Initialize returns an error, Triton will report that\n    the model failed to load.\n\n* Triton creates the TRITONBACKEND_Model object that represents the\n  model. Triton calls TRITONBACKEND_ModelInitialize if it is\n  implemented in the backend shared library.\n  TRITONBACKEND_ModelInitialize should not return until the backend\n  is completely initialized for the model. If\n  TRITONBACKEND_ModelInitialize returns an error, Triton will show\n  that the model failed to load.\n\n* For each model instance specified for the model in the model\n  configuration:\n\n  * Triton creates the TRITONBACKEND_ModelInstance object that\n    represents the model instance.\n\n  * Triton calls TRITONBACKEND_ModelInstanceInitialize if it is\n    implemented in the backend shared library.\n    TRITONBACKEND_ModelInstanceInitialize should not return until the\n    backend is completely initialized for the instance. 
If\n    TRITONBACKEND_ModelInstanceInitialize returns an error, Triton\n    will show that the model failed to load.\n\nBackend, model and model instance finalization is triggered when\nTriton unloads a model.\n\n* For each model instance:\n\n  * Triton calls TRITONBACKEND_ModelInstanceFinalize if it is\n    implemented in the backend shared library.\n    TRITONBACKEND_ModelInstanceFinalize should not return until the\n    backend is completely finalized, including stopping any threads\n    created for the model instance and freeing any user-defined state\n    created for the model instance.\n\n  * Triton destroys the TRITONBACKEND_ModelInstance object that\n    represents the model instance.\n\n* Triton calls TRITONBACKEND_ModelFinalize if it is implemented in the\n  backend shared library. TRITONBACKEND_ModelFinalize should not\n  return until the backend is completely finalized, including stopping\n  any threads created for the model and freeing any user-defined state\n  created for the model.\n\n* Triton destroys the TRITONBACKEND_Model object that represents the\n  model.\n\n* Even if no other loaded model requires the backend, Triton does not\n  finalize and unload the backend until the tritonserver process is\n  exiting. When the tritonserver process exits:\n\n  * Triton calls TRITONBACKEND_Finalize if it is implemented in the\n    backend shared library. TRITONBACKEND_Finalize should not\n    return until the backend is completely finalized, including\n    stopping any threads created for the backend and freeing any\n    user-defined state created for the backend.\n\n  * Triton destroys the TRITONBACKEND_Backend object that represents\n    the backend.\n\n#### Inference Requests and Responses\n\nTriton calls TRITONBACKEND_ModelInstanceExecute to execute inference\nrequests on a model instance. 
Each call to\nTRITONBACKEND_ModelInstanceExecute communicates a batch of requests\nto execute and the instance of the model that should be used to\nexecute those requests. The backend should not allow the caller\nthread to return from TRITONBACKEND_ModelInstanceExecute until that\ninstance is ready to handle another set of requests. Typically this\nmeans that the TRITONBACKEND_ModelInstanceExecute function will\ncreate responses and release the requests before returning. However,\nin case TRITONBACKEND_ModelInstanceExecute returns an error, the ownership\nof requests is transferred back to Triton which will then be responsible\nfor releasing them. Therefore, in the case where TRITONBACKEND_ModelInstanceExecute\nreturns an error, the backend must not retain references to the requests\nor access them in any way. For more detailed description of request/response\nlifetimes, study the documentation of TRITONBACKEND_ModelInstanceExecute in\n[tritonbackend.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonbackend.h).\n\n##### Single Response\n\nMost backends will create a single response for each request. For that\nkind of backend, executing a single inference request requires the\nfollowing steps:\n\n* Create a response for the request using TRITONBACKEND_ResponseNew.\n\n* For each request input tensor use TRITONBACKEND_InputProperties to\n  get shape and datatype of the input as well as the buffer(s)\n  containing the tensor contents.\n\n* For each output tensor which the request expects to be returned, use\n  TRITONBACKEND_ResponseOutput to create the output tensor of the\n  required datatype and shape. 
Use TRITONBACKEND_OutputBuffer to get a\n  pointer to the buffer where the tensor's contents should be written.\n\n* Use the inputs to perform the inference computation that produces\n  the requested output tensor contents into the appropriate output\n  buffers.\n\n* Optionally set parameters in the response.\n\n* Send the response using TRITONBACKEND_ResponseSend.\n\n* Release the request using TRITONBACKEND_RequestRelease.\n\nFor a batch of requests the backend should attempt to combine the\nexecution of the individual requests as much as possible to increase\nperformance.\n\n##### Decoupled Responses\n\nIt is also possible for a backend to send multiple responses\nfor a request. A backend may also\nsend responses out-of-order relative to the order that the request\nbatches are executed. Such backends are called *decoupled* backends.\n\nThe decoupled backends use one `ResponseFactory` object per request to\ncreate and send any number of responses for the request. They must send at\nleast one final response per request (even if it is a flags-only response).\nYou can send a flags-only response with TRITONBACKEND_ResponseFactorySendFlags.\nFor this kind of backend, executing a single inference request typically requires\nthe following steps:\n\n1. For each request input tensor, use TRITONBACKEND_InputProperties to\n  get shape and datatype of the input as well as the buffer(s)\n  containing the tensor contents.\n\n2. Create a `ResponseFactory` object for the request using\n  TRITONBACKEND_ResponseFactoryNew.\n\n3. Create a response from the `ResponseFactory` object using\n  TRITONBACKEND_ResponseNewFromFactory. As long as you have the\n  `ResponseFactory` object, you can continue creating responses.\n\n4. For each output tensor which the request expects to be returned, use\n  TRITONBACKEND_ResponseOutput to create the output tensor of the\n  required datatype and shape. 
Use TRITONBACKEND_OutputBuffer to get a\n  pointer to the buffer where the tensor's contents should be written.\n\n5. Use the inputs to perform the inference computation that produces\n  the requested output tensor contents into the appropriate output\n  buffers.\n\n6. Optionally set parameters in the response.\n\n7. Send the response using TRITONBACKEND_ResponseSend.\n\n8. Repeat steps 3-7 until there are no more responses.\n\n9. Send the last response for a request using either TRITONBACKEND_ResponseSend\n  with a TRITONSERVER_ResponseCompleteFlag or after all responses have been\n  sent for a request using TRITONBACKEND_ResponseFactorySendFlags.\n   This is required for every request.\n\n10. Release the request using TRITONBACKEND_RequestRelease.\n\n###### Special Cases\n\nThe decoupled API is powerful and supports various special cases:\n\n* The model can also send responses out-of-order relative to the order\n  in which it received requests.\n\n* The backend can copy out the contents of the input buffer(s) if\n  request is to be released before the contents are completely\n  consumed to generate responses. After copy, the request can be\n  released anytime before exiting TRITONBACKEND_ModelInstanceExecute.\n  The copies and `ResponseFactory` object can be passed to a separate\n  thread in backend. This means main caller thread can exit from\n  TRITONBACKEND_ModelInstanceExecute and the backend can still continue\n  generating responses as long as it holds `ResponseFactory` object.\n\n\nThe [repeat example](examples/README.md) demonstrates full power of\nwhat can be achieved from decoupled API.\n\n\nStudy documentation of these TRITONBACKEND_* functions in\n[tritonbackend.h](https://github.com/triton-inference-server/core/blob/main/include/triton/core/tritonbackend.h)\nfor more details on these APIs. 
Read\n[Decoupled Backends and Models](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/decoupled_models.md)\nfor more details on how to host a decoupled model.\n\n## Build the Backend Utilities\n\nThe source in this repo builds into a single \"backend utilities\"\nlibrary that is useful when building backends. You don't need to use\nthese utilities but they will be helpful for most backends.\n\nTypically you don't need to build this repo directly but instead you\ncan include it in the build of your backend as is shown in the\nCMakeLists.txt files of the [tutorial examples](examples/README.md).\n\nTo build and install in a local directory use the following commands.\n\n```\n$ mkdir build\n$ cd build\n$ cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install ..\n$ make install\n```\n\nThe following required Triton repositories will be pulled and used in\nthe build. By default the \"main\" branch/tag will be used for each repo\nbut the listed CMake argument can be used to override.\n\n* triton-inference-server/common: -DTRITON_COMMON_REPO_TAG=[tag]\n* triton-inference-server/core: -DTRITON_CORE_REPO_TAG=[tag]\n\nSee the [CMakeLists.txt](CMakeLists.txt) file for other build options.\n\n## Python-based Backends\n\nTriton also provides an option to create [Python-based backends](docs/python_based_backends.md).\nThese backends should implement the\n[`TritonPythonModel` interface](https://github.com/triton-inference-server/python_backend#usage),\nwhich could be re-used as a backend by multiple models.\nWhile the only required function is `execute`,\nyou may find it helpful to enhance your implementation by adding `initialize`,\n`finalize`, and any other helper functions. For examples, please refer to\nthe [vLLM backend](https://github.com/triton-inference-server/vllm_backend),\nwhich provides a common python script to serve models supported by vLLM.\n"
  },
  {
    "path": "cmake/TritonBackendConfig.cmake.in",
    "content": "# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\ninclude(CMakeFindDependencyMacro)\n\nget_filename_component(\n  TRITONBACKEND_CMAKE_DIR \"${CMAKE_CURRENT_LIST_FILE}\" PATH\n)\n\nlist(APPEND CMAKE_MODULE_PATH ${TRITONBACKEND_CMAKE_DIR})\n\nif(NOT TARGET TritonBackend::triton-backend-utils)\n  include(\"${TRITONBACKEND_CMAKE_DIR}/TritonBackendTargets.cmake\")\nendif()\n\nset(TRITONBACKEND_LIBRARIES TritonBackend::triton-backend-utils)\n"
  },
  {
    "path": "cmake/define.cuda_architectures.cmake",
    "content": "# Copyright 2025-2026, NVIDIA CORPORATION. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\nfunction(set_cuda_architectures_list)\n    # Check if CUDA_ARCH_LIST environment variable is set\n    if(DEFINED ENV{CUDA_ARCH_LIST})\n        # Parse CUDA_ARCH_LIST: split by spaces, skip PTX, validate each code\n        set(raw_input \"$ENV{CUDA_ARCH_LIST}\")\n        string(REGEX REPLACE \"PTX\" \"\" raw_input \"${raw_input}\")\n        string(REPLACE \" \" \";\" arch_list \"${raw_input}\")\n\n        set(cuda_arch_result_list \"\")\n        foreach(arch IN LISTS arch_list)\n            string(STRIP \"${arch}\" arch)\n            if(arch STREQUAL \"\")\n                continue()\n            endif()\n            # Normalize: remove dots so 10.0 -> 100, 12.0 -> 120\n            string(REGEX REPLACE \"\\\\.\" \"\" arch_num \"${arch}\")\n            if(NOT arch_num MATCHES \"^[0-9]+$\")\n                continue()\n            endif()\n            # Code >= 100 (10.x, 11.x, 12.x): use family code, no -real\n            if(arch_num GREATER_EQUAL 100)\n                math(EXPR arch_major \"${arch_num} / 10\")\n                set(arch_entry \"${arch_major}0f\")\n            else()\n                set(arch_entry \"${arch_num}-real\")\n            endif()\n            list(APPEND cuda_arch_result_list \"${arch_entry}\")\n        endforeach()\n        # If last element is below 100 (has -real), leave it without -real\n        list(LENGTH cuda_arch_result_list result_len)\n        if(result_len GREATER 0)\n            math(EXPR last_index 
\"${result_len} - 1\")\n            list(GET cuda_arch_result_list ${last_index} last_entry)\n            string(REGEX REPLACE \"-real$\" \"\" last_entry_stripped \"${last_entry}\")\n            if(NOT last_entry_stripped STREQUAL last_entry)\n                list(REMOVE_AT cuda_arch_result_list ${last_index})\n                list(APPEND cuda_arch_result_list \"${last_entry_stripped}\")\n            endif()\n        endif()\n        list(JOIN cuda_arch_result_list \";\" cuda_arch_input)\n\n        set(CUDA_ARCHITECTURES \"${cuda_arch_input}\" PARENT_SCOPE)\n\n        message(STATUS \"CUDA_ARCH_LIST found, defined CUDA_ARCHITECTURES: $ENV{CUDA_ARCH_LIST}\")\n    else()\n        # Set default value if CUDA_ARCH_LIST is not present\n        set(CUDA_ARCHITECTURES \"75-real;80-real;86-real;89-real;90-real;100f;120f\" PARENT_SCOPE)\n        message(STATUS \"CUDA_ARCH_LIST not found, using default values for CUDA_ARCHITECTURES: ${CUDA_ARCHITECTURES}\")\n    endif()\nendfunction()\n\n# Call the function to validate and set CUDA_ARCHITECTURES\nset_cuda_architectures_list()\nmessage(STATUS \"Defined CUDA_ARCHITECTURES: ${CUDA_ARCHITECTURES}\")\n"
  },
  {
    "path": "docs/backend_platform_support_matrix.md",
    "content": "<!--\n# Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n-->\n\n# Backend-Platform Support Matrix\n\nEven though Triton supports inference across various platforms such as\ncloud, data center, edge and embedded devices on NVIDIA GPUs, x86 and\nARM CPU, or AWS Inferentia, it does so by relying on the backends.\nNote that not all Triton backends support every platform. 
The purpose\nof this document is to go over what all compute platforms are supported\nby each of these Triton backends.\nGPU in this document refers to Nvidia GPU. See\n[GPU, Driver, and CUDA Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)\nto learn more about supported GPUs.\n\n## Ubuntu 22.04\n\nThe table below describes target device(s) supported for inference by\neach backend on different platforms.\n\n| Backend      | x86       | ARM-SBSA      |\n| ------------ | --------- | ------------- |\n| TensorRT     |  :heavy_check_mark: GPU <br/> :x: CPU | :heavy_check_mark: GPU <br/> :x: CPU       |\n| ONNX Runtime |  :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU  |   :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU      |\n| TensorFlow   |  :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU  |   :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU      |\n| PyTorch      |  :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU  |   :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU      |\n| OpenVINO     |  :x: GPU <br/> :heavy_check_mark: CPU    |     :x: GPU <br/> :x: CPU       |\n| Python[^1]   |  :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU  |  :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU  |\n| DALI         |  :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU  | :heavy_check_mark: GPU[^2] <br/> :heavy_check_mark: CPU[^2] |\n| FIL          |  :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU  |  Unsupported  |\n| TensorRT-LLM |  :heavy_check_mark: GPU <br/> :x: CPU | :heavy_check_mark: GPU <br/> :x: CPU       |\n| vLLM         |  :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU  |  Unsupported  |\n\n\n## Windows 10\n\nOnly TensorRT and ONNX Runtime backends are supported on Windows.\n\n| Backend      | x86       | ARM-SBSA      |\n| ------------ | --------- | ------------- |\n| TensorRT     |  :heavy_check_mark: GPU <br/> :x: CPU | :heavy_check_mark: GPU <br/> :x: CPU       |\n| ONNX Runtime |  
:heavy_check_mark: GPU <br/> :heavy_check_mark: CPU  |   :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU      |\n\n\n## Jetson JetPack\n\nFollowing backends are currently supported on Jetson Jetpack:\n\n| Backend      |   Jetson  |\n| ------------ | --------- |\n| TensorRT     |  :heavy_check_mark: GPU <br/> :x: CPU    |\n| ONNX Runtime |  :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU  |\n| TensorFlow   |  :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU  |\n| PyTorch      |  :heavy_check_mark: GPU <br/> :heavy_check_mark: CPU  |\n| Python[^1]   |  :x: GPU <br/> :heavy_check_mark: CPU    |\n\n\nLook at the [Triton Inference Server Support for Jetson and JetPack](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/jetson.md).\n\n\n## AWS Inferentia\n\nCurrently, inference on AWS Inferentia is only supported via\n[python backend](https://github.com/triton-inference-server/python_backend#running-with-inferentia)\nwhere the deployed python script invokes AWS Neuron SDK.\n\n\n[^1]: The supported devices for Python Backend are mentioned with\nrespect to Triton. The python script running in Python Backend can\nbe used to execute inference on any hardware if there are available\npython APIs to do so. AWS inferentia is one such example. Triton\ncore is largely unaware of the fact that inference will run on\nInferentia.\n\n[^2]: In case of ARM-SBSA, some operations are not fully supported.\n"
  },
  {
    "path": "docs/python_based_backends.md",
    "content": "<!--\n# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n-->\n\n# Python-based Backends\n\nPython-based backend is a special type of Triton's backends, which does\nnot require any C++ code. 
However, this type of backends depends on\n[Python backend](https://github.com/triton-inference-server/python_backend)\nand requires the following artifacts being present:\n`libtriton_python.so`, `triton_python_backend_stub`,\nand `triton_python_backend_utils.py`.\n\n## Usage\nTo implement and use a Python-based backend, make sure to follow these steps.\n* Implement the\n[`TritonPythonModel` interface](https://github.com/triton-inference-server/python_backend#usage),\nwhich could be re-used as a backend by multiple models.\nThis script should be named `model.py`.\n* Create a folder for your custom backend under the backends directory\n(ex: /opt/tritonserver/backends) with the corresponding backend name,\ncontaining the `model.py`. For example, for a backend named\n`my_python_based_backend`, Triton would expect to find the full path\n`/opt/tritonserver/backends/my_python_based_backend/model.py`.\n* Make sure that `libtriton_python.so`, `triton_python_backend_stub`,\nand `triton_python_backend_utils.py` are present either under\n`/opt/tritonserver/backends/my_python_based_backend/` or\n`/opt/tritonserver/backends/python/`. When both locations contain\nmentioned artifacts, custom backend's artifacts will take priority over Python\nbackend's artifacts. This way, if custom backends needs to use a different\nPython version than what is shipped by default, it can easily be done. 
Please,\nrefer to [customization](#customization) section for more details.\n* Specify `my_python_based_backend` as a backend in `config.pbtxt`\nfor any model, that should use this backend.\n\n```\n...\nbackend: \"my_python_based_backend\"\n...\n```\n\nSince Triton uses Python backend under the hood, it is expected,\nto see `python` backend entry in server logs, even when Python backend\nis not explicitly used.\n\n```\nI1013 21:52:45.756456 18668 server.cc:619]\n+-------------------------+-------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------+\n| Backend                 | Path                                                        | Config                                                                                                              |\n+-------------------------+-------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------+\n| python                  | /opt/tritonserver/backends/python/libtriton_python.so       | {\"cmdline\":{\"auto-complete-config\":\"true\",\"backend-directory\":\"/opt/tritonserver/backends\",\"min-compute-capability\" |\n|                         |                                                             | :\"6.000000\",\"default-max-batch-size\":\"4\"}}                                                                          |\n| my_python_based_backend | /opt/tritonserver/backends/my_python_based_backend/model.py | {\"cmdline\":{\"auto-complete-config\":\"true\",\"backend-directory\":\"/opt/tritonserver/backends\",\"min-compute-capability\" |\n|                         |                                                             | :\"6.000000\",\"default-max-batch-size\":\"4\"}}                                                                          
|\n+-------------------------+-------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------+\n```\n\n## Customization\n\nPython backend shipped in the NVIDIA GPU Cloud containers uses Python 3.10.\nPython backend is able to use the libraries that exist in the\ncurrent Python environment. These libraries can be installed in a virtualenv,\nconda environment, or the global system Python, and\nwill only be used if the Python version matches the Python version\nof the Python backend's stub executable (`triton_python_backend_stub`).\nFor example, if you install a set of libraries in a Python 3.9 environment\nand your Python backend stub is compiled with Python 3.10 these libraries\nwill *NOT* be available. You would need to\n[compile](https://github.com/triton-inference-server/python_backend#building-custom-python-backend-stub)\nthe stub executable with Python 3.9.\n\nIf you want to create a tar file that contains all your Python dependencies\nor you want to use different Python environments for each Python model\nyou need to create a\n[Custom Execution Environment](https://github.com/triton-inference-server/python_backend#creating-custom-execution-environments)\nin Python backend.\n\n## Background\n\nIn some use cases, it is sufficient to implement\n[`TritonPythonModel` interface](https://github.com/triton-inference-server/python_backend#usage)\nonly once and re-use it across multiple models. As an example, please refer\nto the [vLLM backend](https://github.com/triton-inference-server/vllm_backend),\nwhich provides a common python script to serve models supported by vLLM.\n\nTriton Inference Server can handle this special case and treats common\n`model.py` script as a Python-based backend. 
In the scenario, when model\nrelies on a custom Python-based backend, Triton loads `libtriton_python.so`\nfirst, this ensures that Triton knows how to send requests to the backend\nfor execution and the backend knows how to communicate with Triton. Then,\nTriton makes sure to use common `model.py` from the backend's repository,\nand not look for it in the model repository.\n\nWhile the only required function is `execute`, it is typically helpful\nto enhance your implementation by adding `initialize`, `finalize`,\nand any other helper functions. Users are also encouraged to make use of the\n[`auto_complete_config`](https://github.com/triton-inference-server/python_backend#auto_complete_config)\nfunction to define standardized input and output properties upfront.\n"
  },
  {
    "path": "examples/README.md",
    "content": "<!--\n# Copyright 2021-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n-->\n\n[![License](https://img.shields.io/badge/License-BSD3-lightgrey.svg)](https://opensource.org/licenses/BSD-3-Clause)\n\n# Triton Example Backends\n\nTo learn how to create a Triton backend, and to see a best-practices\nbaseline onto which you can add your own backend logic, follow the\n[Tutorial](#tutorial).\n\nTriton also provides a couple of example backends that demonstrate\nspecific aspects of the backend API not covered by the\n[Tutorial](#tutorial).\n\n* The\n[*repeat*](https://github.com/triton-inference-server/repeat_backend)\nbackend shows a more advanced example of how a backend can produce\nmultiple responses per request.\n\n* The\n[*stateful*](https://github.com/triton-inference-server/stateful_backend)\nbackend shows an example of how a backend can manage model state\ntensors on the server-side for the [sequence\nbatcher](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/batcher.md#sequence-batcher)\nto avoid transferring state tensors between client and server. Triton\nalso implements [Implicit State\nManagement](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/architecture.md#implicit-state-management)\nwhich allows backends to behave in a stateless manner and leave the\nstate management to Triton.\n\n## Tutorial\n\nThe [Triton Backend API](../README.md#triton-backend-api) exposes a\nlarge number of features. 
The backend utilities and classes provide\nmany functions commonly used when creating a backend. But to create a\nfunctional backend it is not necessary to use most of the backend API\nor utilities. The tutorial starts with an implementation that shows a\n*minimal* backend and then adds on recommended and optional\nenhancements. The tutorial implementations follow best practices for\nTriton backends and so can be used as templates for your own backend.\n\n### *Minimal* Triton Backend\n\nThe source code for the *minimal* backend is contained in\n[minimal.cc](backends/minimal/src/minimal.cc). The source code\ncontains extensive documentation describing the operation of the\nbackend and the use of the [Triton Backend\nAPI](../README.md#triton-backend-api) and the backend\nutilities. Before reading the source code, make sure you understand\nthe concepts associated with Triton backend abstractions\n[TRITONBACKEND_Backend](../README.md#tritonbackend_backend),\n[TRITONBACKEND_Model](../README.md#tritonbackend_model), and\n[TRITONBACKEND_ModelInstance](../README.md#tritonbackend_modelinstance).\n\nThe *minimal* backend does not do any interesting operation, it simply\ncopies a single input tensor to a single output tensor, but it does\ndemonstrate the basic organization required for a Triton backend.\n\nThe *minimal* backend is complete but for clarity leaves out some\nimportant aspects of writing a full-featured backend that are\ndescribed in [*Recommended* Triton\nBackend](#recommended-triton-backend). When creating your own backend\nuse the [*Recommended* Triton Backend](#recommended-triton-backend) as\na starting point.\n\n#### Building the *Minimal* Backend\n\n[backends/minimal/CMakeLists.txt](backends/minimal/CMakeLists.txt)\nshows the recommended build and install script for a Triton\nbackend. 
To build the *minimal* backend and install in a local directory\nuse the following commands.\n\n```\n$ cd backends/minimal\n$ mkdir build\n$ cd build\n$ cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install ..\n$ make install\n```\n\nThe following required Triton repositories will be pulled and used in\nthe build. By default the \"main\" branch/tag will be used for each repo\nbut the listed CMake argument can be used to override.\n\n* triton-inference-server/backend: -DTRITON_BACKEND_REPO_TAG=[tag]\n* triton-inference-server/core: -DTRITON_CORE_REPO_TAG=[tag]\n* triton-inference-server/common: -DTRITON_COMMON_REPO_TAG=[tag]\n\nIf you are building on a release branch (or on a development branch\nthat is based off of a release branch), then you must set these cmake\narguments to point to that release branch as well. For example, if you\nare building the r21.10 identity_backend branch then you need to use\nthe following additional cmake flags:\n\n```\n-DTRITON_BACKEND_REPO_TAG=r21.10\n-DTRITON_CORE_REPO_TAG=r21.10\n-DTRITON_COMMON_REPO_TAG=r21.10\n```\n\nAfter building the install directory will contain a backends/minimal\ndirectory that contains the *minimal* backend. Instructions for adding\nthis backend to the Triton server are described in [Backend Shared\nLibrary](../README.md#backend-shared-library).\n\n#### Running Triton with the *Minimal* Backend\n\nAfter adding the *minimal* backend to the Triton server as described\nin [Backend Shared Library](../README.md#backend-shared-library), you\ncan run Triton and have it load the models in\n[model_repos/minimal_models](model_repos/minimal_models). 
Assuming you\nhave created a *tritonserver* Docker image by adding the *minimal*\nbackend to Triton, the following command will run Triton:\n\n```\n$ docker run --rm -it --net=host -v/path/to/model_repos/minimal_models:/models tritonserver --model-repository=/models\n```\n\nThe console output will show similar to the following indicating that\nthe *batching* and *nonbatching* models from the minimal_models\nrepository have loaded correctly. Note that the model repository has\ntwo models that both use the *minimal* backend. A backend can support\nany number of different models.\n\n```\nI1215 23:46:00.250284 68 server.cc:589]\n+-------------+---------+--------+\n| Model       | Version | Status |\n+-------------+---------+--------+\n| batching    | 1       | READY  |\n| nonbatching | 1       | READY  |\n+-------------+---------+--------+\n```\n\nThe models are identical except that the *batching* model enabled the\n[dynamic\nbatcher](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/batcher.md#dynamic-batcher)\nand supports batch sizes up to 8. Note that the *batching* model sets\nthe [batch\ndelay](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/batcher.md#delayed-batching)\nto 5 seconds so that the example client described below can\ndemonstrate how the *minimal* backend receives a batch of requests.\n\n#### Testing the *Minimal* Backend\n\nThe [clients](clients) directory holds example clients. The\n[minimal_client](clients/minimal_client) Python script demonstrates\nsending a couple of inference requests to the *minimal* backend. With\nTriton running as described in [Running Triton with the *Minimal*\nBackend](#running-triton-with-the-minimal-backend), execute the\nclient:\n\n```\n$ clients/minimal_client\n```\n\nThe minimal_client first sends a single request to nonbatching\nmodel. 
From the output you can see that the input value is returned in\nthe output.\n\n```\n=========\nSending request to nonbatching model: IN0 = [1 2 3 4]\nResponse: {'model_name': 'nonbatching', 'model_version': '1', 'outputs': [{'name': 'OUT0', 'datatype': 'INT32', 'shape': [4], 'parameters': {'binary_data_size': 16}}]}\nOUT0 = [1 2 3 4]\n```\n\nIn the Triton console output you can see the log message printed by\nthe *minimal* backend that indicates that it received a batch\ncontaining the single request.\n\n```\nI1221 18:14:12.964836 86 minimal.cc:348] model nonbatching: requests in batch 1\nI1221 18:14:12.964857 86 minimal.cc:356] batched IN0 value: [ 1, 2, 3, 4 ]\n```\n\nThe minimal_client next sends 2 requests at the same time to the\nbatching model. Triton will dynamically batch those requests into a\nsingle batch and send that single batch to the *minimal* backend.\n\n```\n=========\nSending request to batching model: IN0 = [[10 11 12 13]]\nSending request to batching model: IN0 = [[20 21 22 23]]\nResponse: {'model_name': 'batching', 'model_version': '1', 'outputs': [{'name': 'OUT0', 'datatype': 'INT32', 'shape': [1, 4], 'parameters': {'binary_data_size': 16}}]}\nOUT0 = [[10 11 12 13]]\nResponse: {'model_name': 'batching', 'model_version': '1', 'outputs': [{'name': 'OUT0', 'datatype': 'INT32', 'shape': [1, 4], 'parameters': {'binary_data_size': 16}}]}\nOUT0 = [[20 21 22 23]]\n```\n\nIn the Triton console output you can see the log message indicating\nthat the *minimal* backend received a batch containing both requests.\n\n```\nI1221 18:14:17.965982 86 minimal.cc:348] model batching: requests in batch 2\nI1221 18:14:17.966035 86 minimal.cc:356] batched IN0 value: [ 10, 11, 12, 13, 20, 21, 22, 23 ]\n```\n\n### *Recommended* Triton Backend\n\nThe source code for the *recommended* backend is contained in\n[recommended.cc](backends/recommended/src/recommended.cc). 
The source\ncode contains extensive documentation describing the operation of the\nbackend and the use of the [Triton Backend\nAPI](../README.md#triton-backend-api) and the backend\nutilities. Before reading the source code, make sure you understand\nthe concepts associated with Triton backend abstractions\n[TRITONBACKEND_Backend](../README.md#tritonbackend_backend),\n[TRITONBACKEND_Model](../README.md#tritonbackend_model), and\n[TRITONBACKEND_ModelInstance](../README.md#tritonbackend_modelinstance).\n\nThe *recommended* backend improves the [*minimal*\nbackend](#minimal-triton-backend) to include the following features\nwhich should be present in any robust backend implementation:\n\n* Enhances the backend to support models with input/output tensors\n  that have datatypes other than INT32.\n\n* Enhances the backend to support models with input/output tensors\n  that have any shape.\n\n* Uses the Triton backend metric APIs to record statistics about\n  requests executing in the backend. These metrics can then be queried\n  using the Triton\n  [metrics](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/metrics.md)\n  and\n  [statistics](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_statistics.md)\n  APIs.\n\n* Additional error checking to ensure that the backend's version is\n  compatible with Triton and that each model's configuration is\n  compatible with the backend.\n\nAs with the *minimal* backend, the *recommended* backend just returns\nthe input tensor value in the output tensor. Because of the additions\ndescribed above, the *recommended* backend can serve as a starting\npoint for your backend.\n\n#### Building the *Recommended* Backend\n\n[backends/recommended/CMakeLists.txt](backends/recommended/CMakeLists.txt)\nshows the recommended build and install script for a Triton\nbackend. 
Building and installing is the same as described in [Building\nthe *Minimal* Backend](#building-the-minimal-backend).\n\n#### Running Triton with the *Recommended* Backend\n\nAfter adding the *recommended* backend to the Triton server as\ndescribed in [Backend Shared\nLibrary](../README.md#backend-shared-library), you can run Triton and\nhave it load the models in\n[model_repos/recommended_models](model_repos/recommended_models). Assuming\nyou have created a *tritonserver* Docker image by adding the\n*recommended* backend to Triton, the following command will run\nTriton:\n\n```\n$ docker run --rm -it --net=host -v/path/to/model_repos/recommended_models:/models tritonserver --model-repository=/models\n```\n\nThe console output will show similar to the following indicating that\nthe *batching* model from the recommended_models repository has\nloaded correctly.\n\n```\nI1215 23:46:00.250284 68 server.cc:589]\n+-------------+---------+--------+\n| Model       | Version | Status |\n+-------------+---------+--------+\n| batching    | 1       | READY  |\n+-------------+---------+--------+\n```\n\n#### Testing the *Recommended* Backend\n\nThe [clients](clients) directory holds example clients. The\n[recommended_client](clients/recommended_client) Python script\ndemonstrates sending a couple of inference requests to the\n*recommended* backend. With Triton running as described in [Running\nTriton with the *Recommended*\nBackend](#running-triton-with-the-recommended-backend), execute the\nclient:\n\n```\n$ clients/recommended_client\n```\n\nThe recommended_client next sends 2 requests at the same time to the\nbatching model, similar to what was done above with the *minimal*\nbackend. Triton will dynamically batch those requests into a single\nbatch and send that single batch to the *recommended* backend. In this\nmodel, batching is supported, the datatype is FP32 and the tensor\nshape is [ -1, 4, 4 ].\n\n```\n=========\nSending request to batching model: input = [[[1.  
1.1 1.2 1.3]\n  [2.  2.1 2.2 2.3]\n  [3.  3.1 3.2 3.3]\n  [4.  4.1 4.2 4.3]]]\nSending request to batching model: input = [[[10.  10.1 10.2 10.3]\n  [20.  20.1 20.2 20.3]\n  [30.  30.1 30.2 30.3]\n  [40.  40.1 40.2 40.3]]]\nResponse: {'model_name': 'batching', 'model_version': '1', 'outputs': [{'name': 'OUTPUT', 'datatype': 'FP32', 'shape': [1, 4, 4], 'parameters': {'binary_data_size': 64}}]}\nOUTPUT = [[[1.  1.1 1.2 1.3]\n  [2.  2.1 2.2 2.3]\n  [3.  3.1 3.2 3.3]\n  [4.  4.1 4.2 4.3]]]\nResponse: {'model_name': 'batching', 'model_version': '1', 'outputs': [{'name': 'OUTPUT', 'datatype': 'FP32', 'shape': [1, 4, 4], 'parameters': {'binary_data_size': 64}}]}\nOUTPUT = [[[10.  10.1 10.2 10.3]\n  [20.  20.1 20.2 20.3]\n  [30.  30.1 30.2 30.3]\n  [40.  40.1 40.2 40.3]]]\n```\n\nIn the Triton console output you can see the log message indicating\nthat the *recommended* backend received a batch containing both\nrequests.\n\n```\nI1221 18:30:52.223226 127 recommended.cc:604] model batching: requests in batch 2\nI1221 18:30:52.223313 127 recommended.cc:613] batched INPUT value: [ 1.000000, 1.100000, 1.200000, 1.300000, 2.000000, 2.100000, 2.200000, 2.300000, 3.000000, 3.100000, 3.200000, 3.300000, 4.000000, 4.100000, 4.200000, 4.300000, 10.000000, 10.100000, 10.200000, 10.300000, 20.000000, 20.100000, 20.200001, 20.299999, 30.000000, 30.100000, 30.200001, 30.299999, 40.000000, 40.099998, 40.200001, 40.299999 ]\n```\n\nBecause the *recommended* backend can support models that have\ninput/output tensors with any datatype and shape, you can edit the\nmodel configuration and the client to experiment with these options.\n\nTo see the metrics collected for these two inference requests, use the following command to access Triton's metrics endpoint.\n\n```\n$ curl localhost:8002/metrics\n```\n\nThe output will be metric values in Prometheus data format. 
The\n[metrics\ndocumentation](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/metrics.md)\ngives a description of these metric values.\n\n```\n# HELP nv_inference_request_success Number of successful inference requests, all batch sizes\n# TYPE nv_inference_request_success counter\nnv_inference_request_success{model=\"batching\",version=\"1\"} 2.000000\n# HELP nv_inference_request_failure Number of failed inference requests, all batch sizes\n# TYPE nv_inference_request_failure counter\nnv_inference_request_failure{model=\"batching\",version=\"1\"} 0.000000\n# HELP nv_inference_count Number of inferences performed\n# TYPE nv_inference_count counter\nnv_inference_count{model=\"batching\",version=\"1\"} 2.000000\n# HELP nv_inference_exec_count Number of model executions performed\n# TYPE nv_inference_exec_count counter\nnv_inference_exec_count{model=\"batching\",version=\"1\"} 1.000000\n...\n```\n\nYou can also see the collected statistics using the [statistics\nendpoint](https://github.com/triton-inference-server/server/blob/main/docs/protocol/extension_statistics.md).\n\n```\n$ curl localhost:8000/v2/models/batching/stats\n{\"model_stats\":[{\"name\":\"batching\",\"version\":\"1\",\"last_inference\":1640111452223,\"inference_count\":2,\"execution_count\":1,\"inference_stats\":{\"success\":{\"count\":2,\"ns\":9997025869},\"fail\":{\"count\":0,\"ns\":0},\"queue\":{\"count\":2,\"ns\":9996491319},\"compute_input\":{\"count\":2,\"ns\":95288},\"compute_infer\":{\"count\":2,\"ns\":232202},\"compute_output\":{\"count\":2,\"ns\":195850}},\"batch_stats\":[{\"batch_size\":2,\"compute_input\":{\"count\":1,\"ns\":47644},\"compute_infer\":{\"count\":1,\"ns\":116101},\"compute_output\":{\"count\":1,\"ns\":97925}}]}]}\n```\n\n### *BLS* Triton Backend\n\nPlease see the [documentation](backends/bls/README.md) of *BLS* Backend.\n\n### Custom Batching\n\nWhen using the dynamic batcher, Triton allows you to set custom batching rules.\nThese rules are added on 
top of the specified dynamic batcher behavior.\nTo set them, you pass in a library that implements the custom batching API.\nTwo example batching libraries are located in the [batching_strategies directory](batching_strategies).\n\nFor this tutorial, you can use the [volume_batching](batching_strategies/volume_batching) example\nto set up a maximum byte volume per request. To build the library and install in a local directory, use the following commands:\n```\n$ cd batching_strategies/volume_batching\n$ mkdir build\n$ cd build\n$ cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install ..\n$ make install\n```\n\nNext, move the library to the desired location.\nYou can pass the file location via the model configuration.\nIf not specified, Triton will look for a library called `batchstrategy.so` in the model version,\nmodel, and backend directories, in that order. For ease, we'll pass it via the model configuration.\nSelect a model to use this strategy with. Then, update the model configuration to have these fields:\n\n```\n  dynamic_batching { }\n  parameters: { key: \"TRITON_BATCH_STRATEGY_PATH\", value: {string_value: \"/path/to/libtriton_volumebatching.so\"}}\n  parameters { key: \"MAX_BATCH_VOLUME_BYTES\" value: {string_value: \"96\"}}\n```\n\nYou can update the path to the filepath of your library.\nYou can also update the value of `MAX_BATCH_VOLUME_BYTES` to the maximum volume per batch for your\nuse case. After starting Triton, you should see the scheduler apply a volume constraint per batch\non top of default batching behavior for your model. This can be made more visible by setting a\n[max queue delay](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/batcher.md#delayed-batching)\nto give the scheduler more time for each batch to be completed. 
For example, you could set the\ndelay to 100,000 microseconds.\n\n### Enhancements\n\nThis section describes several optional features that you can add to\nenhance the capabilities of your backend.\n\n#### Automatic Model Configuration Generation\n\n[Automatic model configuration\ngeneration](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/model_configuration.md#auto-generated-model-configuration)\nis enabled by the backend implementing the appropriate logic (for\nexample, in a function called AutoCompleteConfig) during\nTRITONBACKEND_ModelInitialize. For the *recommended* backend you would\nadd a call to AutoCompleteConfig in the ModelState constructor just\nbefore the call to ValidateModelConfig. The AutoCompleteConfig\nfunction can update the model configuration with input tensor, output\ntensor, and max-batch-size configuration; and then update the\nconfiguration using TRITONBACKEND_ModelSetConfig. Examples can be\nfound in [ONNXRuntime\nbackend](https://github.com/triton-inference-server/onnxruntime_backend),\n[TensorFlow\nbackend](https://github.com/triton-inference-server/tensorflow_backend)\nand other backends.\n\n#### Add Key-Value Parameters to a Response\n\nA backend can add a key-value pair to a response any time after the\nresponse is created and before it is sent. The parameter key must be a\nstring and the parameter value can be a string, integer or\nboolean. The following example shows the TRITONBACKEND API used to set\nresponse parameters. Error checking code is not shown to improve\nclarity.\n\n```\nTRITONBACKEND_ResponseSetStringParameter(response, \"param0\", \"an example string parameter\");\nTRITONBACKEND_ResponseSetIntParameter(responses[r], \"param1\", 42);\nTRITONBACKEND_ResponseSetBoolParameter(responses[r], \"param2\", false);\n```\n\n#### Access Model Artifacts in the Model Repository\n\nA backend can access any of the files in a model's area of the model\nrepository. 
These files are typically needed during\nTRITONBACKEND_ModelInitialize but can be accessed at other times as\nwell. The TRITONBACKEND_ModelRepository API gives the location of the\nmodel's repository. For example, the following code can be run during\nTRITONBACKEND_ModelInitialize to write the location to the log.\n\n```\n// Can get location of the model artifacts. Normally we would need\n// to check the artifact type to make sure it was something we can\n// handle... but we are just going to log the location so we don't\n// need the check. We would use the location if we wanted to load\n// something from the model's repo.\nTRITONBACKEND_ArtifactType artifact_type;\nconst char* clocation;\nRETURN_IF_ERROR(\n    TRITONBACKEND_ModelRepository(model, &artifact_type, &clocation));\nLOG_MESSAGE(\n    TRITONSERVER_LOG_INFO,\n    (std::string(\"Repository location: \") + clocation).c_str());\n```\n\nThe framework backends (for example, TensorRT, ONNXRuntime,\nTensorFlow, PyTorch) read the actual model file from the model\nrepository using this API. See those backends for examples of how it\ncan be used.\n"
  },
  {
    "path": "examples/backends/bls/CMakeLists.txt",
    "content": "# Copyright 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\ncmake_minimum_required(VERSION 3.31.8)\n\nproject(tritonblsbackend LANGUAGES C CXX)\n\n#\n# Options\n#\n# Must include options required for this project as well as any\n# projects included in this one by FetchContent.\n#\n# GPU support is disabled by default because BLS backend doesn't\n# support GPUs.\n#\noption(TRITON_ENABLE_GPU \"Enable GPU support in backend\" OFF)\noption(TRITON_ENABLE_STATS \"Include statistics collections in backend\" ON)\n\nset(TRITON_REPO_ORGANIZATION \"https://github.com/triton-inference-server\" CACHE STRING \"Git repository to pull from\")\nset(TRITON_BACKEND_REPO_TAG \"main\" CACHE STRING \"Tag for triton-inference-server/backend repo\")\nset(TRITON_CORE_REPO_TAG \"main\" CACHE STRING \"Tag for triton-inference-server/core repo\")\nset(TRITON_COMMON_REPO_TAG \"main\" CACHE STRING \"Tag for triton-inference-server/common repo\")\n\n#\n# Setting C++ min standard\n#\nset(TRITON_MIN_CXX_STANDARD 17 CACHE STRING \"The minimum C++ standard whose features are requested to build this target.\")\n\nif(NOT CMAKE_BUILD_TYPE)\n  set(CMAKE_BUILD_TYPE Release)\nendif()\n\n#\n# Dependencies\n#\n# FetchContent's composability isn't very good. 
We must include the\n# transitive closure of all repos so that we can override the tag.\n#\ninclude(FetchContent)\n\nFetchContent_Declare(\n  repo-common\n  GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/common.git\n  GIT_TAG ${TRITON_COMMON_REPO_TAG}\n  GIT_SHALLOW ON\n)\nFetchContent_Declare(\n  repo-core\n  GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/core.git\n  GIT_TAG ${TRITON_CORE_REPO_TAG}\n  GIT_SHALLOW ON\n)\nFetchContent_Declare(\n  repo-backend\n  GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/backend.git\n  GIT_TAG ${TRITON_BACKEND_REPO_TAG}\n  GIT_SHALLOW ON\n)\nFetchContent_MakeAvailable(repo-common repo-core repo-backend)\n\n#\n# Shared library implementing the Triton Backend API\n#\nconfigure_file(src/libtriton_bls.ldscript libtriton_bls.ldscript COPYONLY)\n\nadd_library(\n  triton-bls-backend SHARED\n  src/backend.cc\n  src/bls.h\n  src/bls.cc\n  src/bls_utils.h\n  src/bls_utils.cc\n)\n\nadd_library(\n  TritonBLSBackend::triton-bls-backend ALIAS triton-bls-backend\n)\n\ntarget_include_directories(\n  triton-bls-backend\n  PRIVATE\n    ${CMAKE_CURRENT_SOURCE_DIR}/src\n)\n\ntarget_compile_features(triton-bls-backend PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD})\ntarget_compile_options(\n  triton-bls-backend PRIVATE\n  $<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:\n    -Wall -Wextra -Wno-unused-parameter -Wno-error=maybe-uninitialized>\n)\n\ntarget_link_libraries(\n  triton-bls-backend\n  PRIVATE\n    triton-core-serverstub  # from repo-core\n    triton-backend-utils    # from repo-backend\n)\n\nset_target_properties(\n  triton-bls-backend PROPERTIES\n  POSITION_INDEPENDENT_CODE ON\n  OUTPUT_NAME triton_bls\n  LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_bls.ldscript\n  LINK_FLAGS \"-Wl,--version-script libtriton_bls.ldscript\"\n)\n\n#\n# Install\n#\ninclude(GNUInstallDirs)\nset(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TritonBLSBackend)\n\ninstall(\n  TARGETS\n    triton-bls-backend\n  EXPORT\n    
triton-bls-backend-targets\n  LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/bls\n  ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/bls\n)\n\ninstall(\n  EXPORT\n    triton-bls-backend-targets\n  FILE\n    TritonBLSBackendTargets.cmake\n  NAMESPACE\n    TritonBLSBackend::\n  DESTINATION\n    ${INSTALL_CONFIGDIR}\n)\n\ninclude(CMakePackageConfigHelpers)\nconfigure_package_config_file(\n  ${CMAKE_CURRENT_LIST_DIR}/cmake/TritonBLSBackendConfig.cmake.in\n  ${CMAKE_CURRENT_BINARY_DIR}/TritonBLSBackendConfig.cmake\n  INSTALL_DESTINATION ${INSTALL_CONFIGDIR}\n)\n\ninstall(\n  FILES\n  ${CMAKE_CURRENT_BINARY_DIR}/TritonBLSBackendConfig.cmake\n  DESTINATION ${INSTALL_CONFIGDIR}\n)\n\n#\n# Export from build tree\n#\nexport(\n  EXPORT triton-bls-backend-targets\n  FILE ${CMAKE_CURRENT_BINARY_DIR}/TritonBLSBackendTargets.cmake\n  NAMESPACE TritonBLSBackend::\n)\n\nexport(PACKAGE TritonBLSBackend)\n"
  },
  {
    "path": "examples/backends/bls/README.md",
    "content": "<!--\n# Copyright 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n-->\n\n# *BLS* Triton Backend\n\nThe [*BLS*](../bls) backend demonstrates using in-process C-API to\nexecute inferences within the backend. 
This backend serves as an example to\nbackend developers for implementing their own custom pipeline in C++.\nFor Python use cases, please refer to the\n[Business Logic Scripting](https://github.com/triton-inference-server/python_backend/blob/main/README.md#business-logic-scripting)\nsection in the Python backend.\n\nThe source code for the *bls* backend is contained in\n[src](./src).\n\n* [backend.cc](./src/backend.cc) contains the main backend\nimplementation. The content of this file is not BLS specific. It only includes\nthe required Triton backend functions that are standard for any backend\nimplementation. The BLS logic is set off in the\n`TRITONBACKEND_ModelInstanceExecute` with lines `bls_executor.Execute(requests[r], &responses[r]);`.\n\n* [bls.h](./src/bls.h) is where the BLS logic (class `BLSExecutor`) of\nthis example is located. You can refer to this file to see how to interact with\nTriton in-process C-API to build the custom execution pipeline.\n\n* [bls_utils.h](./src/bls_utils.h) is where all the utilities that\nare not BLS dependent are located.\n\nThe source code contains extensive documentation describing the operation of\nthe backend and the use of the\n[Triton Backend API](../../../README.md#triton-backend-api) and the\n[Triton Server API](https://github.com/triton-inference-server/server/blob/main/docs/customization_guide/inprocess_c_api.md).\nBefore reading the source code, make sure you understand\nthe concepts associated with Triton backend abstractions\n[TRITONBACKEND_Backend](../../../README.md#tritonbackend_backend),\n[TRITONBACKEND_Model](../../../README.md#tritonbackend_model), and\n[TRITONBACKEND_ModelInstance](../../../README.md#tritonbackend_modelinstance).\n\nThe *bls* backend will send two requests on the 'addsub_python' and 'addsub_onnx'\nmodels. 
After the inference requests are completed, this backend will extract\nOUTPUT0 from the 'addsub_python' and OUTPUT1 from the 'addsub_onnx' model to\nconstruct the final inference response object using these tensors.\n\nThere are some self-imposed limitations that were made for the simplicity of\nthis example:\n1. This backend does not support batching.\n2. This backend does not support decoupled models.\n3. This backend does not support GPU tensors.\n4. The model configuration should be set strictly as described in the comments\nin [backend.cc](./src/backend.cc).\n\nYou can implement your own custom backend that is not subject to the\nlimitations mentioned above.\n\n## Building the *BLS* Backend\n\n[backends/bls/CMakeLists.txt](CMakeLists.txt)\nshows the recommended build and install script for a Triton\nbackend. Building and installing is the same as described in [Building\nthe *Minimal* Backend](../../README.md#building-the-minimal-backend).\n\n## Running Triton with the *BLS* Backend\n\nAfter adding the *bls* backend to the Triton server as\ndescribed in [Backend Shared\nLibrary](../../../README.md#backend-shared-library), you can run Triton and\nhave it load the models in\n[model_repos/bls_models](../../model_repos/bls_models). 
Assuming you have created a\n*tritonserver* Docker image by adding the *bls* backend to Triton, the\nfollowing command will run Triton:\n\n```\n$ docker run --rm -it --net=host -v/path/to/model_repos/bls_models:/models tritonserver --model-repository=/models\n```\n\nThe console output will be similar to the following, indicating that\nthe *bls_fp32*, *addsub_python* and *addsub_onnx* models from the bls_models repository have\nloaded correctly.\n\n```\nI0616 09:34:47.767433 19214 server.cc:629]\n+---------------+---------+--------+\n| Model         | Version | Status |\n+---------------+---------+--------+\n| addsub_python | 1       | READY  |\n| addsub_onnx   | 1       | READY  |\n| bls_fp32      | 1       | READY  |\n+---------------+---------+--------+\n```\n\n## Testing the *BLS* Backend\n\nThe [clients](../../clients) directory holds example clients. The\n[bls_client](../../clients/bls_client) Python script demonstrates sending an\ninference request to the *bls* backend. With Triton running as\ndescribed in [Running Triton with the *BLS* Backend](#running-triton-with-the-bls-backend),\nexecute the client:\n\n```\n$ clients/bls_client\n```\n\nYou should see output similar to the following:\n\n```\nINPUT0 ([0.42935285 0.51512766 0.43625894 ... 0.6670954  0.17747518 0.7976901 ]) + INPUT1 ([6.7752063e-01 2.4223252e-01 6.7743927e-01 ... 4.1531715e-01 2.5451833e-01 7.9097062e-01]) = OUTPUT0 ([1.1068735  0.75736016 1.1136982 ... 1.0824126  0.4319935  1.5886607 ])\nINPUT0 ([0.42935285 0.51512766 0.43625894 ... 0.6670954  0.17747518 0.7976901 ]) - INPUT1 ([6.7752063e-01 2.4223252e-01 6.7743927e-01 ... 4.1531715e-01 2.5451833e-01 7.9097062e-01]) = OUTPUT1 ([-0.24816778  0.27289516 -0.24118033 ... 0.25177827 -0.07704315  0.00671947])\n\nPASS\n```\n"
  },
  {
    "path": "examples/backends/bls/cmake/TritonBLSBackendConfig.cmake.in",
    "content": "# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\ninclude(CMakeFindDependencyMacro)\n\nget_filename_component(\n  TRITONBLSBACKEND_CMAKE_DIR \"${CMAKE_CURRENT_LIST_FILE}\" PATH\n)\n\nlist(APPEND CMAKE_MODULE_PATH ${TRITONBLSBACKEND_CMAKE_DIR})\n\nif(NOT TARGET TritonBLSBackend::triton-bls-backend)\n  include(\"${TRITONBLSBACKEND_CMAKE_DIR}/TritonBLSBackendTargets.cmake\")\nendif()\n\nset(TRITONBLSBACKEND_LIBRARIES TritonBLSBackend::triton-bls-backend)\n"
  },
  {
    "path": "examples/backends/bls/src/backend.cc",
    "content": "// Copyright 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n//\n// Redistribution and use in source and binary forms, with or without\n// modification, are permitted provided that the following conditions\n// are met:\n//  * Redistributions of source code must retain the above copyright\n//    notice, this list of conditions and the following disclaimer.\n//  * Redistributions in binary form must reproduce the above copyright\n//    notice, this list of conditions and the following disclaimer in the\n//    documentation and/or other materials provided with the distribution.\n//  * Neither the name of NVIDIA CORPORATION nor the names of its\n//    contributors may be used to endorse or promote products derived\n//    from this software without specific prior written permission.\n//\n// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n// PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n#include \"bls.h\"\n#include \"triton/backend/backend_model.h\"\n#include \"triton/backend/backend_model_instance.h\"\n\n//\n// Backend that demonstrates using in-process C-API to execute inferences\n// within the backend.\n//\n// Two particular models, 'addsub_python' and 'addsub_onnx', must be loaded on\n// the server for a successful inference execution on this backend.\n//\n// The model configuration should be set as follows in order to be in line with\n// the 'addsub_python' and 'addsub_onnx' models. This backend does not support\n// batching. These limitations are only for this specific backend. You can\n// implement your custom BLS backend with less limitations.\n//\n// Model Configuration:\n//   - Input 'INPUT0' must have shape [16] and datatype must be TYPE_FP32.\n//\n//   - Input 'INPUT1' must have shape [16] and datatype must be TYPE_FP32.\n//\n//   - For each response, output 'OUTPUT0' must have shape [16] and\n//     datatype TYPE_FP32.\n//\n//   - For each response, output 'OUTPUT1' must have shape [16] and\n//     datatype TYPE_FP32.\n//\n// This backend will send two requests on the 'addsub_python' and 'addsub_onnx'\n// models. 
After the inference requests are completed, this backend\n// will extract OUTPUT0 from the 'addsub_python' and OUTPUT1 from the\n// 'addsub_onnx' model to construct the final inference response object using\n// these tensors.\n\nnamespace triton { namespace backend { namespace bls {\n\n//\n// ModelState\n//\n// State associated with a model that is using this backend. An object\n// of this class is created and associated with each\n// TRITONBACKEND_Model.\n//\nclass ModelState : public BackendModel {\n public:\n  static TRITONSERVER_Error* Create(\n      TRITONBACKEND_Model* triton_model, ModelState** state);\n  virtual ~ModelState() = default;\n\n  // Validate that model configuration is supported by this backend.\n  TRITONSERVER_Error* ValidateModelConfig();\n\n private:\n  ModelState(TRITONBACKEND_Model* triton_model) : BackendModel(triton_model) {}\n};\n\nTRITONSERVER_Error*\nModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state)\n{\n  try {\n    *state = new ModelState(triton_model);\n  }\n  catch (const BackendModelException& ex) {\n    RETURN_ERROR_IF_TRUE(\n        ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL,\n        std::string(\"unexpected nullptr in BackendModelException\"));\n    RETURN_IF_ERROR(ex.err_);\n  }\n\n  return nullptr;  // success\n}\n\nTRITONSERVER_Error*\nModelState::ValidateModelConfig()\n{\n  // We have the json DOM for the model configuration...\n  common::TritonJson::WriteBuffer buffer;\n  RETURN_IF_ERROR(model_config_.PrettyWrite(&buffer));\n  LOG_MESSAGE(\n      TRITONSERVER_LOG_INFO,\n      (std::string(\"model configuration:\\n\") + buffer.Contents()).c_str());\n\n  // max_batch_size must be 0 because this backend does not support\n  // batching\n  int64_t max_batch_size;\n  RETURN_IF_ERROR(model_config_.MemberAsInt(\"max_batch_size\", &max_batch_size));\n  RETURN_ERROR_IF_FALSE(\n      max_batch_size == 0, TRITONSERVER_ERROR_INVALID_ARG,\n      std::string(\"bls backend only supports models with 
max_batch_size == 0\"));\n\n  common::TritonJson::Value inputs, outputs;\n  RETURN_IF_ERROR(model_config_.MemberAsArray(\"input\", &inputs));\n  RETURN_IF_ERROR(model_config_.MemberAsArray(\"output\", &outputs));\n\n  // There must be 2 inputs and 2 outputs.\n  RETURN_ERROR_IF_FALSE(\n      inputs.ArraySize() == 2, TRITONSERVER_ERROR_INVALID_ARG,\n      std::string(\"expected 2 inputs, got \") +\n          std::to_string(inputs.ArraySize()));\n  RETURN_ERROR_IF_FALSE(\n      outputs.ArraySize() == 2, TRITONSERVER_ERROR_INVALID_ARG,\n      std::string(\"expected 2 outputs, got \") +\n          std::to_string(outputs.ArraySize()));\n\n  // Here we rely on the model configuration listing the inputs and\n  // outputs in a specific order, which we shouldn't really require...\n  common::TritonJson::Value input0, input1, output0, output1;\n  RETURN_IF_ERROR(inputs.IndexAsObject(0, &input0));\n  RETURN_IF_ERROR(inputs.IndexAsObject(1, &input1));\n  RETURN_IF_ERROR(outputs.IndexAsObject(0, &output0));\n  RETURN_IF_ERROR(outputs.IndexAsObject(1, &output1));\n\n  // Check tensor names\n  std::string in0_name, in1_name, out0_name, out1_name;\n  RETURN_IF_ERROR(input0.MemberAsString(\"name\", &in0_name));\n  RETURN_IF_ERROR(input1.MemberAsString(\"name\", &in1_name));\n  RETURN_IF_ERROR(output0.MemberAsString(\"name\", &out0_name));\n  RETURN_IF_ERROR(output1.MemberAsString(\"name\", &out1_name));\n\n  RETURN_ERROR_IF_FALSE(\n      in0_name == \"INPUT0\", TRITONSERVER_ERROR_INVALID_ARG,\n      std::string(\"expected first input tensor name to be INPUT0, got \") +\n          in0_name);\n  RETURN_ERROR_IF_FALSE(\n      in1_name == \"INPUT1\", TRITONSERVER_ERROR_INVALID_ARG,\n      std::string(\"expected second input tensor name to be INPUT1, got \") +\n          in1_name);\n  RETURN_ERROR_IF_FALSE(\n      out0_name == \"OUTPUT0\", TRITONSERVER_ERROR_INVALID_ARG,\n      std::string(\"expected first output tensor name to be OUTPUT0, got \") +\n          out0_name);\n  
RETURN_ERROR_IF_FALSE(\n      out1_name == \"OUTPUT1\", TRITONSERVER_ERROR_INVALID_ARG,\n      std::string(\"expected second output tensor name to be OUTPUT1, got \") +\n          out1_name);\n\n  // Check shapes\n  std::vector<int64_t> in0_shape, in1_shape, out0_shape, out1_shape;\n  RETURN_IF_ERROR(backend::ParseShape(input0, \"dims\", &in0_shape));\n  RETURN_IF_ERROR(backend::ParseShape(input1, \"dims\", &in1_shape));\n  RETURN_IF_ERROR(backend::ParseShape(output0, \"dims\", &out0_shape));\n  RETURN_IF_ERROR(backend::ParseShape(output1, \"dims\", &out1_shape));\n\n  RETURN_ERROR_IF_FALSE(\n      in0_shape.size() == 1, TRITONSERVER_ERROR_INVALID_ARG,\n      std::string(\"expected INPUT0 shape to have one dimension, got \") +\n          backend::ShapeToString(in0_shape));\n  RETURN_ERROR_IF_FALSE(\n      in1_shape.size() == 1, TRITONSERVER_ERROR_INVALID_ARG,\n      std::string(\"expected INPUT1 shape to have one dimension, got \") +\n          backend::ShapeToString(in1_shape));\n  RETURN_ERROR_IF_FALSE(\n      out0_shape.size() == 1, TRITONSERVER_ERROR_INVALID_ARG,\n      std::string(\"expected OUTPUT0 shape to have one dimension, got \") +\n          backend::ShapeToString(out0_shape));\n  RETURN_ERROR_IF_FALSE(\n      out1_shape.size() == 1, TRITONSERVER_ERROR_INVALID_ARG,\n      std::string(\"expected OUTPUT1 shape to have one dimension, got \") +\n          backend::ShapeToString(out1_shape));\n\n  // Check datatypes\n  std::string in0_dtype, in1_dtype, out0_dtype, out1_dtype;\n  RETURN_IF_ERROR(input0.MemberAsString(\"data_type\", &in0_dtype));\n  RETURN_IF_ERROR(input1.MemberAsString(\"data_type\", &in1_dtype));\n  RETURN_IF_ERROR(output0.MemberAsString(\"data_type\", &out0_dtype));\n  RETURN_IF_ERROR(output1.MemberAsString(\"data_type\", &out1_dtype));\n\n  RETURN_ERROR_IF_FALSE(\n      in0_dtype == \"TYPE_FP32\", TRITONSERVER_ERROR_INVALID_ARG,\n      std::string(\"expected INPUT0 datatype to be TYPE_FP32, got \") +\n          in0_dtype);\n  
RETURN_ERROR_IF_FALSE(\n      in1_dtype == \"TYPE_FP32\", TRITONSERVER_ERROR_INVALID_ARG,\n      std::string(\"expected INPUT1 datatype to be TYPE_FP32, got \") +\n          in1_dtype);\n  RETURN_ERROR_IF_FALSE(\n      out0_dtype == \"TYPE_FP32\", TRITONSERVER_ERROR_INVALID_ARG,\n      std::string(\"expected OUTPUT0 datatype to be TYPE_FP32, got \") +\n          out0_dtype);\n  RETURN_ERROR_IF_FALSE(\n      out1_dtype == \"TYPE_FP32\", TRITONSERVER_ERROR_INVALID_ARG,\n      std::string(\"expected OUTPUT1 datatype to be TYPE_FP32, got \") +\n          out1_dtype);\n\n  return nullptr;  // success\n}\n\n//\n// ModelInstanceState\n//\n// State associated with a model instance. An object of this class is\n// created and associated with each TRITONBACKEND_ModelInstance.\n//\nclass ModelInstanceState : public BackendModelInstance {\n public:\n  static TRITONSERVER_Error* Create(\n      ModelState* model_state,\n      TRITONBACKEND_ModelInstance* triton_model_instance,\n      ModelInstanceState** state);\n  virtual ~ModelInstanceState() = default;\n\n  void ProcessRequests(\n      TRITONBACKEND_Request** requests, const uint32_t request_count);\n\n private:\n  ModelInstanceState(\n      ModelState* model_state,\n      TRITONBACKEND_ModelInstance* triton_model_instance)\n      : BackendModelInstance(model_state, triton_model_instance)\n  {\n  }\n};\n\nTRITONSERVER_Error*\nModelInstanceState::Create(\n    ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance,\n    ModelInstanceState** state)\n{\n  try {\n    *state = new ModelInstanceState(model_state, triton_model_instance);\n  }\n  catch (const BackendModelInstanceException& ex) {\n    RETURN_ERROR_IF_TRUE(\n        ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL,\n        std::string(\"unexpected nullptr in BackendModelInstanceException\"));\n    RETURN_IF_ERROR(ex.err_);\n  }\n\n  return nullptr;  // success\n}\n\nvoid\nModelInstanceState::ProcessRequests(\n    TRITONBACKEND_Request** requests, 
const uint32_t request_count)\n{\n  uint64_t exec_start_ns = 0;\n  SET_TIMESTAMP(exec_start_ns);\n\n  for (size_t i = 0; i < request_count; i++) {\n    // If we get a nullptr request then something is badly wrong. Fail\n    // and release all requests.\n    if (requests[i] == nullptr) {\n      RequestsRespondWithError(\n          requests, request_count,\n          TRITONSERVER_ErrorNew(\n              TRITONSERVER_ERROR_INTERNAL,\n              std::string(\n                  \"null request given to BLS backend for '\" + Name() + \"'\")\n                  .c_str()));\n      return;\n    }\n  }\n\n  // At this point we accept ownership of 'requests', which means that\n  // even if something goes wrong we must still return success from\n  // this function. If something does go wrong in processing a\n  // particular request then we send an error response just for the\n  // specific request.\n  std::vector<TRITONBACKEND_Response*> responses;\n  responses.reserve(request_count);\n\n  for (size_t i = 0; i < request_count; i++) {\n    TRITONBACKEND_Response* response;\n    auto err = TRITONBACKEND_ResponseNew(&response, requests[i]);\n    if (err == nullptr) {\n      responses.emplace_back(response);\n    } else {\n      responses.emplace_back(nullptr);\n      LOG_MESSAGE(TRITONSERVER_LOG_ERROR, \"Fail to create response\");\n      TRITONSERVER_ErrorDelete(err);\n    }\n  }\n\n  ModelState* model_state = reinterpret_cast<ModelState*>(Model());\n\n  // The way we collect these batch timestamps is not entirely\n  // accurate. Normally, in a performant backend you would execute all\n  // the requests at the same time, and so there would be a single\n  // compute-start / compute-end time-range. But here we execute each\n  // request separately so there is no single range. As a result we\n  // just show the entire execute time as being the compute time as\n  // well.\n  uint64_t compute_start_ns = 0;\n  SET_TIMESTAMP(compute_start_ns);\n\n  // Create a BLSExecutor object. 
To separate from standard backend\n  // implementation, the BLS logic is placed inside class BLSExecutor.\n  BLSExecutor bls_executor(model_state->TritonServer());\n\n  for (size_t r = 0; r < request_count; r++) {\n    bls_executor.Execute(requests[r], &responses[r]);\n  }\n\n  uint64_t compute_end_ns = 0;\n  SET_TIMESTAMP(compute_end_ns);\n\n  uint64_t exec_end_ns = 0;\n  SET_TIMESTAMP(exec_end_ns);\n\n  // Send all the responses that haven't already been sent because of\n  // an earlier error. Note that the responses are not set to nullptr\n  // here as we need that indication below to determine if the request\n  // we successful or not.\n  for (auto& response : responses) {\n    if (response != nullptr) {\n      LOG_IF_ERROR(\n          TRITONBACKEND_ResponseSend(\n              response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr),\n          \"failed to send BLS backend response\");\n    }\n  }\n\n  // Report statistics for each request.\n  for (uint32_t r = 0; r < request_count; ++r) {\n    auto& request = requests[r];\n    LOG_IF_ERROR(\n        TRITONBACKEND_ModelInstanceReportStatistics(\n            TritonModelInstance(), request,\n            (responses[r] != nullptr) /* success */, exec_start_ns,\n            compute_start_ns, compute_end_ns, exec_end_ns),\n        \"failed reporting request statistics\");\n\n    LOG_IF_ERROR(\n        TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL),\n        \"failed releasing request\");\n  }\n\n  // Report the entire batch statistics.\n  LOG_IF_ERROR(\n      TRITONBACKEND_ModelInstanceReportBatchStatistics(\n          TritonModelInstance(), 1 /*total_batch_size*/, exec_start_ns,\n          compute_start_ns, compute_end_ns, exec_end_ns),\n      \"failed reporting batch request statistics\");\n\n  LOG_MESSAGE(\n      TRITONSERVER_LOG_VERBOSE,\n      (std::string(\"TRITONBACKEND_ModelExecute: model \") + Name() +\n       \" released \" + std::to_string(request_count) + \" requests\")\n         
 .c_str());\n}\n\n/////////////\n\nextern \"C\" {\n\n// Implementing TRITONBACKEND_ModelInitialize is optional. The backend\n// should initialize any state that is intended to be shared across\n// all instances of the model.\nTRITONSERVER_Error*\nTRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model)\n{\n  const char* cname;\n  RETURN_IF_ERROR(TRITONBACKEND_ModelName(model, &cname));\n  std::string name(cname);\n\n  uint64_t version;\n  RETURN_IF_ERROR(TRITONBACKEND_ModelVersion(model, &version));\n\n  LOG_MESSAGE(\n      TRITONSERVER_LOG_INFO,\n      (std::string(\"TRITONBACKEND_ModelInitialize: \") + name + \" (version \" +\n       std::to_string(version) + \")\")\n          .c_str());\n\n  // With each model we create a ModelState object and associate it\n  // with the TRITONBACKEND_Model.\n  ModelState* model_state;\n  RETURN_IF_ERROR(ModelState::Create(model, &model_state));\n  RETURN_IF_ERROR(\n      TRITONBACKEND_ModelSetState(model, reinterpret_cast<void*>(model_state)));\n\n  // One of the primary things to do in ModelInitialize is to examine\n  // the model configuration to ensure that it is something that this\n  // backend can support. If not, returning an error from this\n  // function will prevent the model from loading.\n  RETURN_IF_ERROR(model_state->ValidateModelConfig());\n\n  return nullptr;  // success\n}\n\n// Implementing TRITONBACKEND_ModelFinalize is optional unless state\n// is set using TRITONBACKEND_ModelSetState. The backend must free\n// this state and perform any other cleanup.\nTRITONSERVER_Error*\nTRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model)\n{\n  void* vstate;\n  RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate));\n  ModelState* model_state = reinterpret_cast<ModelState*>(vstate);\n\n  LOG_MESSAGE(\n      TRITONSERVER_LOG_INFO, \"TRITONBACKEND_ModelFinalize: delete model state\");\n\n  delete model_state;\n\n  return nullptr;  // success\n}\n\n// Implementing TRITONBACKEND_ModelInstanceInitialize is optional. 
The\n// backend should initialize any state that is required for a model\n// instance.\nTRITONSERVER_Error*\nTRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance)\n{\n  const char* cname;\n  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceName(instance, &cname));\n  std::string name(cname);\n\n  int32_t device_id;\n  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceDeviceId(instance, &device_id));\n  TRITONSERVER_InstanceGroupKind kind;\n  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceKind(instance, &kind));\n\n  LOG_MESSAGE(\n      TRITONSERVER_LOG_INFO,\n      (std::string(\"TRITONBACKEND_ModelInstanceInitialize: \") + name + \" (\" +\n       TRITONSERVER_InstanceGroupKindString(kind) + \" device \" +\n       std::to_string(device_id) + \")\")\n          .c_str());\n\n  // The instance can access the corresponding model as well... here\n  // we get the model and from that get the model's state.\n  TRITONBACKEND_Model* model;\n  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model));\n\n  void* vmodelstate;\n  RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate));\n  ModelState* model_state = reinterpret_cast<ModelState*>(vmodelstate);\n\n  // With each instance we create a ModelInstanceState object and\n  // associate it with the TRITONBACKEND_ModelInstance.\n  ModelInstanceState* instance_state;\n  RETURN_IF_ERROR(\n      ModelInstanceState::Create(model_state, instance, &instance_state));\n  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState(\n      instance, reinterpret_cast<void*>(instance_state)));\n\n  LOG_MESSAGE(\n      TRITONSERVER_LOG_VERBOSE,\n      (std::string(\"TRITONBACKEND_ModelInstanceInitialize: instance \"\n                   \"initialization successful \") +\n       name + \" (device \" + std::to_string(device_id) + \")\")\n          .c_str());\n\n  return nullptr;  // success\n}\n\n// Implementing TRITONBACKEND_ModelInstanceFinalize is optional unless\n// state is set using TRITONBACKEND_ModelInstanceSetState. 
The backend\n// must free this state and perform any other cleanup.\nTRITONSERVER_Error*\nTRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance)\n{\n  void* vstate;\n  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate));\n  ModelInstanceState* instance_state =\n      reinterpret_cast<ModelInstanceState*>(vstate);\n\n  LOG_MESSAGE(\n      TRITONSERVER_LOG_INFO,\n      \"TRITONBACKEND_ModelInstanceFinalize: delete instance state\");\n\n  delete instance_state;\n\n  return nullptr;  // success\n}\n\n// Implementing TRITONBACKEND_ModelInstanceExecute is required.\nTRITONSERVER_Error*\nTRITONBACKEND_ModelInstanceExecute(\n    TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests,\n    const uint32_t request_count)\n{\n  // Triton will not call this function simultaneously for the same\n  // 'instance'. But since this backend could be used by multiple\n  // instances from multiple models the implementation needs to handle\n  // multiple calls to this function at the same time (with different\n  // 'instance' objects). Suggested practice for this is to use only\n  // function-local and model-instance-specific state (obtained from\n  // 'instance'), which is what we do here.\n  ModelInstanceState* instance_state;\n  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(\n      instance, reinterpret_cast<void**>(&instance_state)));\n  ModelState* model_state =\n      reinterpret_cast<ModelState*>(instance_state->Model());\n\n  LOG_MESSAGE(\n      TRITONSERVER_LOG_VERBOSE,\n      (std::string(\"model \") + model_state->Name() + \", instance \" +\n       instance_state->Name() + \", executing \" + std::to_string(request_count) +\n       \" requests\")\n          .c_str());\n\n  instance_state->ProcessRequests(requests, request_count);\n\n  return nullptr;  // success\n}\n\n}  // extern \"C\"\n\n}}}  // namespace triton::backend::bls\n"
  },
  {
    "path": "examples/backends/bls/src/bls.cc",
    "content": "// Copyright 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n//\n// Redistribution and use in source and binary forms, with or without\n// modification, are permitted provided that the following conditions\n// are met:\n//  * Redistributions of source code must retain the above copyright\n//    notice, this list of conditions and the following disclaimer.\n//  * Redistributions in binary form must reproduce the above copyright\n//    notice, this list of conditions and the following disclaimer in the\n//    documentation and/or other materials provided with the distribution.\n//  * Neither the name of NVIDIA CORPORATION nor the names of its\n//    contributors may be used to endorse or promote products derived\n//    from this software without specific prior written permission.\n//\n// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n// PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n#include \"bls.h\"\n\nnamespace triton { namespace backend { namespace bls {\n\nBLSExecutor::BLSExecutor(TRITONSERVER_Server* server)\n    : server_(server), model_executor_(server)\n{\n}\n\nTRITONSERVER_Error*\nBLSExecutor::PrepareInferenceRequest(\n    TRITONBACKEND_Request* bls_request,\n    TRITONSERVER_InferenceRequest** irequest, const std::string model_name)\n{\n  // Get request_id, correlation_id, and flags from the current request\n  // for preparing a new inference request that we will send to 'addsub_python'\n  // or 'addsub_onnx' model later.\n  const char* request_id;\n  uint64_t correlation_id;\n  uint32_t flags;\n  RETURN_IF_ERROR(TRITONBACKEND_RequestId(bls_request, &request_id));\n  RETURN_IF_ERROR(\n      TRITONBACKEND_RequestCorrelationId(bls_request, &correlation_id));\n  RETURN_IF_ERROR(TRITONBACKEND_RequestFlags(bls_request, &flags));\n\n  // Create an inference request object. 
The inference request object\n  // is where we set the name of the model we want to use for\n  // inference and the input tensors.\n  RETURN_IF_ERROR(TRITONSERVER_InferenceRequestNew(\n      irequest, server_, model_name.c_str(), -1 /* model_version */));\n  // Set request_id, correlation_id, and flags for the new request.\n  RETURN_IF_ERROR(TRITONSERVER_InferenceRequestSetId(*irequest, request_id));\n  RETURN_IF_ERROR(\n      TRITONSERVER_InferenceRequestSetCorrelationId(*irequest, correlation_id));\n  RETURN_IF_ERROR(TRITONSERVER_InferenceRequestSetFlags(*irequest, flags));\n  RETURN_IF_ERROR(TRITONSERVER_InferenceRequestSetReleaseCallback(\n      *irequest, InferRequestComplete, nullptr /* request_release_userp */));\n\n  return nullptr;  // success\n}\n\nTRITONSERVER_Error*\nBLSExecutor::PrepareInferenceInput(\n    TRITONBACKEND_Request* bls_request, TRITONSERVER_InferenceRequest* irequest)\n{\n  // Get the properties of the two inputs from the current request.\n  // Then, add the two input tensors and append the input data to the new\n  // request.\n  uint32_t input_count;\n  RETURN_IF_ERROR(TRITONBACKEND_RequestInputCount(bls_request, &input_count));\n\n  TRITONBACKEND_Input* input;\n  const char* name;\n  TRITONSERVER_DataType datatype;\n  const int64_t* shape;\n  uint32_t dims_count;\n  size_t data_byte_size;\n  TRITONSERVER_MemoryType data_memory_type;\n  int64_t data_memory_id;\n  const char* data_buffer;\n\n  for (size_t count = 0; count < input_count; count++) {\n    RETURN_IF_ERROR(TRITONBACKEND_RequestInputByIndex(\n        bls_request, count /* index */, &input));\n    RETURN_IF_ERROR(TRITONBACKEND_InputProperties(\n        input, &name, &datatype, &shape, &dims_count, nullptr, nullptr));\n    RETURN_IF_ERROR(TRITONBACKEND_InputBuffer(\n        input, 0 /* idx */, reinterpret_cast<const void**>(&data_buffer),\n        &data_byte_size, &data_memory_type, &data_memory_id));\n    RETURN_IF_ERROR(TRITONSERVER_InferenceRequestAddInput(\n        irequest, 
name, datatype, shape, dims_count));\n    RETURN_IF_ERROR(TRITONSERVER_InferenceRequestAppendInputData(\n        irequest, name, &data_buffer[0], data_byte_size, data_memory_type,\n        data_memory_id));\n  }\n\n  return nullptr;  // success\n}\n\nTRITONSERVER_Error*\nBLSExecutor::PrepareInferenceOutput(\n    TRITONBACKEND_Request* bls_request, TRITONSERVER_InferenceRequest* irequest)\n{\n  // Indicate the output tensors to be calculated and returned\n  // for the inference request.\n  uint32_t output_count;\n  RETURN_IF_ERROR(TRITONBACKEND_RequestOutputCount(bls_request, &output_count));\n  const char* output_name;\n  for (size_t count = 0; count < output_count; count++) {\n    RETURN_IF_ERROR(TRITONBACKEND_RequestOutputName(\n        bls_request, count /* index */, &output_name));\n    RETURN_IF_ERROR(\n        TRITONSERVER_InferenceRequestAddRequestedOutput(irequest, output_name));\n  }\n\n  return nullptr;  // success\n}\n\nvoid\nBLSExecutor::Execute(\n    TRITONBACKEND_Request* bls_request, TRITONBACKEND_Response** response)\n{\n  // The names of the models that we will send internal requests on.\n  std::vector<std::string> model_names = {\"addsub_python\", \"addsub_onnx\"};\n\n  // Check if both models are valid before executing request.\n  try {\n    for (size_t i = 0; i < 2; i++) {\n      // Check if the model is ready.\n      bool is_ready = false;\n      THROW_IF_TRITON_ERROR(TRITONSERVER_ServerModelIsReady(\n          server_, model_names[i].c_str(), -1 /* model_version */, &is_ready));\n      if (!is_ready) {\n        throw BLSBackendException(\n            (std::string(\"Failed to execute the inference request. Model '\") +\n             model_names[i].c_str() + \"' is not ready.\")\n                .c_str());\n      }\n      // For simplicity, decoupled API is not supported in this BLS backend. 
You\n      // can implement your own backend that supports decoupled models.\n      uint32_t txn_flags;\n      THROW_IF_TRITON_ERROR(TRITONSERVER_ServerModelTransactionProperties(\n          server_, model_names[i].c_str(), -1 /* model_version */, &txn_flags,\n          nullptr /* voidp */));\n      if ((txn_flags & TRITONSERVER_TXN_DECOUPLED) != 0) {\n        throw BLSBackendException(\n            std::string(\"Model '\") + model_names[i].c_str() +\n            \"' is using the decoupled API. This BLS Backend doesn't support \"\n            \"models using the decoupled transaction policy.\");\n      }\n    }\n  }\n  catch (const BLSBackendException& bls_exception) {\n    LOG_MESSAGE(TRITONSERVER_LOG_ERROR, bls_exception.what());\n    RESPOND_AND_SET_NULL_IF_ERROR(\n        response,\n        TRITONSERVER_ErrorNew(\n            TRITONSERVER_ERROR_INTERNAL, \"Failed to send inference requests\"));\n    return;\n  }\n\n  // Prepare std::future for each model. Since this BLS backend\n  // can handle requests in parallel, we will send all the inference\n  // requests first and then retrieve them later.\n  std::vector<std::future<TRITONSERVER_InferenceResponse*>> futures(2);\n\n  // The inference request object for sending internal requests.\n  TRITONSERVER_InferenceRequest* irequest = nullptr;\n\n  // For each inference request, the backend sends two requests on the\n  // 'addsub_python' and 'addsub_onnx' models.\n  try {\n    for (size_t icount = 0; icount < 2; icount++) {\n      // Initialize the inference request with required information.\n      THROW_IF_TRITON_ERROR(\n          PrepareInferenceRequest(bls_request, &irequest, model_names[icount]));\n      THROW_IF_TRITON_ERROR(PrepareInferenceInput(bls_request, irequest));\n      THROW_IF_TRITON_ERROR(PrepareInferenceOutput(bls_request, irequest));\n\n      // Execute inference request.\n      THROW_IF_TRITON_ERROR(\n          model_executor_.AsyncExecute(irequest, &futures[icount]));\n    }\n  }\n  catch (const 
BLSBackendException& bls_exception) {\n    LOG_MESSAGE(TRITONSERVER_LOG_ERROR, bls_exception.what());\n    LOG_IF_ERROR(\n        TRITONSERVER_InferenceRequestDelete(irequest),\n        \"Failed to delete inference request.\");\n    RESPOND_AND_SET_NULL_IF_ERROR(\n        response,\n        TRITONSERVER_ErrorNew(\n            TRITONSERVER_ERROR_INTERNAL, \"Failed to send inference requests\"));\n    return;\n  }\n\n  // If both internal requests are sent successfully, retrieve the output from\n  // each request and construct the final response.\n  ConstructFinalResponse(response, std::move(futures));\n}\n\nvoid\nBLSExecutor::ConstructFinalResponse(\n    TRITONBACKEND_Response** response,\n    std::vector<std::future<TRITONSERVER_InferenceResponse*>> futures)\n{\n  // Prepare two TRITONSERVER_InferenceResponse* objects for 'addsub_python' and\n  // 'addsub_onnx' respectively.\n  std::vector<TRITONSERVER_InferenceResponse*> completed_responses = {\n      nullptr, nullptr};\n\n  const char* output_name;\n  TRITONSERVER_DataType output_datatype;\n  const int64_t* output_shape;\n  uint64_t dims_count;\n  size_t output_byte_size;\n  TRITONSERVER_MemoryType output_memory_type;\n  int64_t output_memory_id;\n  const void* output_base;\n  void* userp;\n  for (size_t icount = 0; icount < 2; icount++) {\n    // Retrieve the corresponding TRITONSERVER_InferenceResponse object from\n    // 'futures'. 
The InferResponseComplete function sets the std::promise\n    // so that this thread will block until the response is returned.\n    completed_responses[icount] = futures[icount].get();\n    try {\n      THROW_IF_TRITON_ERROR(\n          TRITONSERVER_InferenceResponseError(completed_responses[icount]));\n    }\n    catch (const BLSBackendException& bls_exception) {\n      LOG_MESSAGE(TRITONSERVER_LOG_ERROR, bls_exception.what());\n\n      if (completed_responses[icount] != nullptr) {\n        LOG_IF_ERROR(\n            TRITONSERVER_InferenceResponseDelete(completed_responses[icount]),\n            \"Failed to delete inference response.\");\n      }\n      return;\n    }\n    // Retrieve outputs from 'completed_responses'.\n    // Extract OUTPUT0 from the 'addsub_python' and OUTPUT1 from the\n    // 'addsub_onnx' model to form the final inference response object.\n    // Get all the information about the output tensor.\n    RESPOND_AND_SET_NULL_IF_ERROR(\n        response,\n        TRITONSERVER_InferenceResponseOutput(\n            completed_responses[icount], icount, &output_name, &output_datatype,\n            &output_shape, &dims_count, &output_base, &output_byte_size,\n            &output_memory_type, &output_memory_id, &userp));\n\n    // Create an output tensor in the final response with\n    // the information retrieved above.\n    TRITONBACKEND_Output* output;\n    RESPOND_AND_SET_NULL_IF_ERROR(\n        response, TRITONBACKEND_ResponseOutput(\n                      *response, &output, output_name, output_datatype,\n                      output_shape, dims_count));\n\n    // Get a buffer that holds the tensor data for the output.\n    // We request a buffer in CPU memory but we have to handle any returned\n    // type. 
If we get back a buffer in GPU memory we just fail the request.\n    void* output_buffer;\n    output_memory_type = TRITONSERVER_MEMORY_CPU;\n    RESPOND_AND_SET_NULL_IF_ERROR(\n        response, TRITONBACKEND_OutputBuffer(\n                      output, &output_buffer, output_byte_size,\n                      &output_memory_type, &output_memory_id));\n    if (output_memory_type == TRITONSERVER_MEMORY_GPU) {\n      RESPOND_AND_SET_NULL_IF_ERROR(\n          response, TRITONSERVER_ErrorNew(\n                        TRITONSERVER_ERROR_INTERNAL,\n                        \"failed to create output buffer in CPU memory\"));\n    }\n\n    // Fill the BLS output buffer with output data returned by internal\n    // requests.\n    memcpy(output_buffer, output_base, output_byte_size);\n\n    LOG_IF_ERROR(\n        TRITONSERVER_InferenceResponseDelete(completed_responses[icount]),\n        \"Failed to delete inference response.\");\n  }\n}\n\n}}}  // namespace triton::backend::bls\n"
  },
  {
    "path": "examples/backends/bls/src/bls.h",
    "content": "// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n//\n// Redistribution and use in source and binary forms, with or without\n// modification, are permitted provided that the following conditions\n// are met:\n//  * Redistributions of source code must retain the above copyright\n//    notice, this list of conditions and the following disclaimer.\n//  * Redistributions in binary form must reproduce the above copyright\n//    notice, this list of conditions and the following disclaimer in the\n//    documentation and/or other materials provided with the distribution.\n//  * Neither the name of NVIDIA CORPORATION nor the names of its\n//    contributors may be used to endorse or promote products derived\n//    from this software without specific prior written permission.\n//\n// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n// PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n#include <future>\n\n#include \"bls_utils.h\"\n#include \"triton/backend/backend_common.h\"\n#include \"triton/core/tritonbackend.h\"\n#include \"triton/core/tritonserver.h\"\n\nnamespace triton { namespace backend { namespace bls {\n\n//\n// BLSExecutor\n//\n// Includes the custom BLS logic for this backend.\n// This class shows how to utilize Triton in-process C-API to build the\n// execution pipeline.\n//\nclass BLSExecutor {\n public:\n  BLSExecutor(TRITONSERVER_Server* server);\n\n  // Prepares the inference request that will be used internally.\n  TRITONSERVER_Error* PrepareInferenceRequest(\n      TRITONBACKEND_Request* bls_request,\n      TRITONSERVER_InferenceRequest** irequest, const std::string model_name);\n\n  // Prepares the input for the internal inference request.\n  TRITONSERVER_Error* PrepareInferenceInput(\n      TRITONBACKEND_Request* bls_request,\n      TRITONSERVER_InferenceRequest* irequest);\n\n  // Prepares the output for the internal inference request.\n  TRITONSERVER_Error* PrepareInferenceOutput(\n      TRITONBACKEND_Request* bls_request,\n      TRITONSERVER_InferenceRequest* irequest);\n\n  // Performs the whole BLS pipeline.\n  void Execute(\n      TRITONBACKEND_Request* bls_request, TRITONBACKEND_Response** response);\n\n  // Constructs the final response.\n  void ConstructFinalResponse(\n      TRITONBACKEND_Response** response,\n      std::vector<std::future<TRITONSERVER_InferenceResponse*>> 
futures);\n\n private:\n  // The server object that encapsulates all the functionality of the Triton\n  // server and allows access to the Triton server API.\n  TRITONSERVER_Server* server_;\n\n  // The ModelExecutor object for executing inference request on a model.\n  ModelExecutor model_executor_;\n};\n\n}}}  // namespace triton::backend::bls\n"
  },
  {
    "path": "examples/backends/bls/src/bls_utils.cc",
    "content": "// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n//\n// Redistribution and use in source and binary forms, with or without\n// modification, are permitted provided that the following conditions\n// are met:\n//  * Redistributions of source code must retain the above copyright\n//    notice, this list of conditions and the following disclaimer.\n//  * Redistributions in binary form must reproduce the above copyright\n//    notice, this list of conditions and the following disclaimer in the\n//    documentation and/or other materials provided with the distribution.\n//  * Neither the name of NVIDIA CORPORATION nor the names of its\n//    contributors may be used to endorse or promote products derived\n//    from this software without specific prior written permission.\n//\n// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n// PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n#include \"bls_utils.h\"\n\nnamespace triton { namespace backend { namespace bls {\n\nTRITONSERVER_Error*\nCPUAllocator(\n    TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,\n    size_t byte_size, TRITONSERVER_MemoryType preferred_memory_type,\n    int64_t preferred_memory_type_id, void* userp, void** buffer,\n    void** buffer_userp, TRITONSERVER_MemoryType* actual_memory_type,\n    int64_t* actual_memory_type_id)\n{\n  // For simplicity, this backend example always uses CPU memory regardless of\n  // the preferred memory type. You can make the actual memory type and id that\n  // we allocate be the same as preferred memory type. 
You can also provide a\n  // customized allocator to support different preferred_memory_type, and reuse\n  // memory buffer when possible.\n  *actual_memory_type = TRITONSERVER_MEMORY_CPU;\n  *actual_memory_type_id = preferred_memory_type_id;\n\n  // If 'byte_size' is zero just return 'buffer' == nullptr, we don't\n  // need to do any other book-keeping.\n  if (byte_size == 0) {\n    *buffer = nullptr;\n    *buffer_userp = nullptr;\n    LOG_MESSAGE(\n        TRITONSERVER_LOG_VERBOSE, (\"allocated \" + std::to_string(byte_size) +\n                                   \" bytes for result tensor \" + tensor_name)\n                                      .c_str());\n  } else {\n    void* allocated_ptr = nullptr;\n    *actual_memory_type = TRITONSERVER_MEMORY_CPU;\n    allocated_ptr = malloc(byte_size);\n\n    // Pass the tensor name with buffer_userp so we can show it when\n    // releasing the buffer.\n    if (allocated_ptr != nullptr) {\n      *buffer = allocated_ptr;\n      *buffer_userp = new std::string(tensor_name);\n      LOG_MESSAGE(\n          TRITONSERVER_LOG_VERBOSE,\n          (\"allocated \" + std::to_string(byte_size) + \" bytes in \" +\n           TRITONSERVER_MemoryTypeString(*actual_memory_type) +\n           \" for result tensor \" + tensor_name)\n              .c_str());\n    }\n  }\n\n  return nullptr;  // Success\n}\n\nTRITONSERVER_Error*\nResponseRelease(\n    TRITONSERVER_ResponseAllocator* allocator, void* buffer, void* buffer_userp,\n    size_t byte_size, TRITONSERVER_MemoryType memory_type,\n    int64_t memory_type_id)\n{\n  std::string* name = nullptr;\n  if (buffer_userp != nullptr) {\n    name = reinterpret_cast<std::string*>(buffer_userp);\n  } else {\n    name = new std::string(\"<unknown>\");\n  }\n\n  std::stringstream ss;\n  ss << buffer;\n  std::string buffer_str = ss.str();\n\n  LOG_MESSAGE(\n      TRITONSERVER_LOG_VERBOSE,\n      (\"Releasing buffer \" + buffer_str + \" of size \" +\n       std::to_string(byte_size) + \" in \" +\n       
TRITONSERVER_MemoryTypeString(memory_type) + \" for result '\" + *name)\n          .c_str());\n\n  switch (memory_type) {\n    case TRITONSERVER_MEMORY_CPU:\n      free(buffer);\n      break;\n    default:\n      LOG_MESSAGE(\n          TRITONSERVER_LOG_ERROR,\n          std::string(\n              \"error: unexpected buffer allocated in CUDA managed memory\")\n              .c_str());\n      break;\n  }\n\n  delete name;\n\n  return nullptr;  // Success\n}\n\nvoid\nInferRequestComplete(\n    TRITONSERVER_InferenceRequest* request, const uint32_t flags, void* userp)\n{\n  if (request != nullptr) {\n    LOG_IF_ERROR(\n        TRITONSERVER_InferenceRequestDelete(request),\n        \"Failed to delete inference request.\");\n  }\n}\n\nvoid\nInferResponseComplete(\n    TRITONSERVER_InferenceResponse* response, const uint32_t flags, void* userp)\n{\n  // The following logic only works for non-decoupled models as for decoupled\n  // models it may send multiple responses for a request or not send any\n  // responses for a request. Need to modify this function if the model is using\n  // decoupled API.\n  if (response != nullptr) {\n    // Send 'response' to the future.\n    std::promise<TRITONSERVER_InferenceResponse*>* p =\n        reinterpret_cast<std::promise<TRITONSERVER_InferenceResponse*>*>(userp);\n    p->set_value(response);\n    delete p;\n  }\n}\n\nModelExecutor::ModelExecutor(TRITONSERVER_Server* server) : server_(server)\n{\n  // When triton needs a buffer to hold an output tensor, it will ask\n  // us to provide the buffer. In this way we can have any buffer\n  // management and sharing strategy that we want. To communicate to\n  // triton the functions that we want it to call to perform the\n  // allocations, we create a \"response allocator\" object. We pass\n  // this response allocate object to triton when requesting\n  // inference. 
We can reuse this response allocator object for any\n  // number of inference requests.\n  allocator_ = nullptr;\n  THROW_IF_TRITON_ERROR(TRITONSERVER_ResponseAllocatorNew(\n      &allocator_, CPUAllocator, ResponseRelease, nullptr /* start_fn */));\n}\n\nTRITONSERVER_Error*\nModelExecutor::AsyncExecute(\n    TRITONSERVER_InferenceRequest* irequest,\n    std::future<TRITONSERVER_InferenceResponse*>* future)\n{\n  // Perform inference by calling TRITONSERVER_ServerInferAsync. This\n  // call is asynchronous and therefore returns immediately. The\n  // completion of the inference and delivery of the response is done\n  // by triton by calling the \"response complete\" callback functions\n  // (InferResponseComplete in this case).\n  auto p = new std::promise<TRITONSERVER_InferenceResponse*>();\n  *future = p->get_future();\n\n  RETURN_IF_ERROR(TRITONSERVER_InferenceRequestSetResponseCallback(\n      irequest, allocator_, nullptr /* response_allocator_userp */,\n      InferResponseComplete, reinterpret_cast<void*>(p)));\n\n  RETURN_IF_ERROR(\n      TRITONSERVER_ServerInferAsync(server_, irequest, nullptr /* trace */));\n\n  return nullptr;  // success\n}\n\n}}}  // namespace triton::backend::bls\n"
  },
  {
    "path": "examples/backends/bls/src/bls_utils.h",
    "content": "// Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n//\n// Redistribution and use in source and binary forms, with or without\n// modification, are permitted provided that the following conditions\n// are met:\n//  * Redistributions of source code must retain the above copyright\n//    notice, this list of conditions and the following disclaimer.\n//  * Redistributions in binary form must reproduce the above copyright\n//    notice, this list of conditions and the following disclaimer in the\n//    documentation and/or other materials provided with the distribution.\n//  * Neither the name of NVIDIA CORPORATION nor the names of its\n//    contributors may be used to endorse or promote products derived\n//    from this software without specific prior written permission.\n//\n// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n// PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n#include <future>\n#include <sstream>\n\n#include \"triton/backend/backend_common.h\"\n#include \"triton/core/tritonbackend.h\"\n#include \"triton/core/tritonserver.h\"\n\nnamespace triton { namespace backend { namespace bls {\n\n#define THROW_IF_TRITON_ERROR(X)                                       \\\n  do {                                                                 \\\n    TRITONSERVER_Error* tie_err__ = (X);                               \\\n    if (tie_err__ != nullptr) {                                        \\\n      throw BLSBackendException(TRITONSERVER_ErrorMessage(tie_err__)); \\\n    }                                                                  \\\n  } while (false)\n\n//\n// BLSBackendException\n//\n// Exception thrown if error occurs in BLSBackend.\n//\nstruct BLSBackendException : std::exception {\n  BLSBackendException(const std::string& message) : message_(message) {}\n\n  const char* what() const throw() { return message_.c_str(); }\n\n  std::string message_;\n};\n\n// Performs the allocations of output tensors.\nTRITONSERVER_Error* CPUAllocator(\n    TRITONSERVER_ResponseAllocator* allocator, const char* tensor_name,\n    size_t byte_size, TRITONSERVER_MemoryType preferred_memory_type,\n    int64_t preferred_memory_type_id, void* userp, void** buffer,\n    void** buffer_userp, TRITONSERVER_MemoryType* actual_memory_type,\n    int64_t* actual_memory_type_id);\n\n// Callback functions for server 
inference.\nTRITONSERVER_Error* ResponseRelease(\n    TRITONSERVER_ResponseAllocator* allocator, void* buffer, void* buffer_userp,\n    size_t byte_size, TRITONSERVER_MemoryType memory_type,\n    int64_t memory_type_id);\nvoid InferRequestComplete(\n    TRITONSERVER_InferenceRequest* request, const uint32_t flags, void* userp);\nvoid InferResponseComplete(\n    TRITONSERVER_InferenceResponse* response, const uint32_t flags,\n    void* userp);\n\n//\n// ModelExecutor\n//\n// Execute inference request on a model.\n//\nclass ModelExecutor {\n public:\n  ModelExecutor(TRITONSERVER_Server* server);\n\n  // Performs async inference request.\n  TRITONSERVER_Error* AsyncExecute(\n      TRITONSERVER_InferenceRequest* irequest,\n      std::future<TRITONSERVER_InferenceResponse*>* future);\n\n private:\n  // The server object that encapsulates all the functionality of the Triton\n  // server and allows access to the Triton server API.\n  TRITONSERVER_Server* server_;\n\n  // The allocator object that will be used for allocating output tensors.\n  TRITONSERVER_ResponseAllocator* allocator_;\n};\n\n}}}  // namespace triton::backend::bls\n"
  },
  {
    "path": "examples/backends/bls/src/libtriton_bls.ldscript",
    "content": "# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n{\n  global:\n    TRITONBACKEND_*;\n  local: *;\n};\n"
  },
  {
    "path": "examples/backends/minimal/CMakeLists.txt",
    "content": "# Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\ncmake_minimum_required(VERSION 3.31.8)\n\nproject(tutorialminimalbackend LANGUAGES C CXX)\n\n#\n# Options\n#\n# Must include options required for this project as well as any\n# projects included in this one by FetchContent.\n#\n# GPU support is disabled by default because minimal backend doesn't\n# use GPUs.\n#\noption(TRITON_ENABLE_GPU \"Enable GPU support in backend\" OFF)\noption(TRITON_ENABLE_STATS \"Include statistics collections in backend\" ON)\n\nset(TRITON_REPO_ORGANIZATION \"https://github.com/triton-inference-server\" CACHE STRING \"Git repository to pull from\")\nset(TRITON_COMMON_REPO_TAG \"main\" CACHE STRING \"Tag for triton-inference-server/common repo\")\nset(TRITON_CORE_REPO_TAG \"main\" CACHE STRING \"Tag for triton-inference-server/core repo\")\nset(TRITON_BACKEND_REPO_TAG \"main\" CACHE STRING \"Tag for triton-inference-server/backend repo\")\n\n#\n# Setting C++ min standard\n#\nset(TRITON_MIN_CXX_STANDARD 17 CACHE STRING \"The minimum C++ standard whose features are requested to build this target.\")\n\nif(NOT CMAKE_BUILD_TYPE)\n  set(CMAKE_BUILD_TYPE Release)\nendif()\n\n#\n# Dependencies\n#\n# FetchContent requires us to include the transitive closure of all\n# repos that we depend on so that we can override the tags.\n#\ninclude(FetchContent)\n\nFetchContent_Declare(\n  repo-common\n  GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/common.git\n  GIT_TAG ${TRITON_COMMON_REPO_TAG}\n  GIT_SHALLOW 
ON\n)\nFetchContent_Declare(\n  repo-core\n  GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/core.git\n  GIT_TAG ${TRITON_CORE_REPO_TAG}\n  GIT_SHALLOW ON\n)\nFetchContent_Declare(\n  repo-backend\n  GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/backend.git\n  GIT_TAG ${TRITON_BACKEND_REPO_TAG}\n  GIT_SHALLOW ON\n)\nFetchContent_MakeAvailable(repo-common repo-core repo-backend)\n\n#\n# The backend must be built into a shared library. Use an ldscript to\n# hide all symbols except for the TRITONBACKEND API.\n#\nconfigure_file(src/libtriton_minimal.ldscript libtriton_minimal.ldscript COPYONLY)\n\nadd_library(\n  triton-minimal-backend SHARED\n  src/minimal.cc\n)\n\nadd_library(\n  TutorialMinimalBackend::triton-minimal-backend ALIAS triton-minimal-backend\n)\n\ntarget_include_directories(\n  triton-minimal-backend\n  PRIVATE\n    ${CMAKE_CURRENT_SOURCE_DIR}/src\n)\n\ntarget_compile_features(triton-minimal-backend PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD})\ntarget_compile_options(\n  triton-minimal-backend PRIVATE\n  $<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:\n    -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror>\n  $<$<CXX_COMPILER_ID:MSVC>:/Wall /D_WIN32_WINNT=0x0A00 /EHsc /Zc:preprocessor>\n)\n\ntarget_link_libraries(\n  triton-minimal-backend\n  PRIVATE\n    triton-core-serverapi   # from repo-core\n    triton-core-backendapi  # from repo-core\n    triton-core-serverstub  # from repo-core\n    triton-backend-utils    # from repo-backend\n)\n\nif(WIN32)\n  set_target_properties(\n    triton-minimal-backend PROPERTIES\n    POSITION_INDEPENDENT_CODE ON\n    OUTPUT_NAME triton_minimal\n  )\nelse()\n  set_target_properties(\n    triton-minimal-backend PROPERTIES\n    POSITION_INDEPENDENT_CODE ON\n    OUTPUT_NAME triton_minimal\n    LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_minimal.ldscript\n    LINK_FLAGS \"-Wl,--version-script libtriton_minimal.ldscript\"\n  )\nendif()\n\n#\n# 
Install\n#\ninclude(GNUInstallDirs)\nset(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TutorialMinimalBackend)\n\ninstall(\n  TARGETS\n    triton-minimal-backend\n  EXPORT\n    triton-minimal-backend-targets\n  LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/minimal\n  RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/minimal\n)\n\ninstall(\n  EXPORT\n    triton-minimal-backend-targets\n  FILE\n    TutorialMinimalBackendTargets.cmake\n  NAMESPACE\n    TutorialMinimalBackend::\n  DESTINATION\n    ${INSTALL_CONFIGDIR}\n)\n\ninclude(CMakePackageConfigHelpers)\nconfigure_package_config_file(\n  ${CMAKE_CURRENT_LIST_DIR}/cmake/TutorialMinimalBackendConfig.cmake.in\n  ${CMAKE_CURRENT_BINARY_DIR}/TutorialMinimalBackendConfig.cmake\n  INSTALL_DESTINATION ${INSTALL_CONFIGDIR}\n)\n\ninstall(\n  FILES\n  ${CMAKE_CURRENT_BINARY_DIR}/TutorialMinimalBackendConfig.cmake\n  DESTINATION ${INSTALL_CONFIGDIR}\n)\n\n#\n# Export from build tree\n#\nexport(\n  EXPORT triton-minimal-backend-targets\n  FILE ${CMAKE_CURRENT_BINARY_DIR}/TutorialMinimalBackendTargets.cmake\n  NAMESPACE TutorialMinimalBackend::\n)\n\nexport(PACKAGE TutorialMinimalBackend)\n"
  },
  {
    "path": "examples/backends/minimal/cmake/TutorialMinimalBackendConfig.cmake.in",
    "content": "# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\ninclude(CMakeFindDependencyMacro)\n\nget_filename_component(\n  TUTORIALMINIMALBACKEND_CMAKE_DIR \"${CMAKE_CURRENT_LIST_FILE}\" PATH\n)\n\nlist(APPEND CMAKE_MODULE_PATH ${TUTORIALMINIMALBACKEND_CMAKE_DIR})\n\nif(NOT TARGET TutorialMinimalBackend::triton-minimal-backend)\n  include(\"${TUTORIALMINIMALBACKEND_CMAKE_DIR}/TutorialMinimalBackendTargets.cmake\")\nendif()\n\nset(TUTORIALMINIMALBACKEND_LIBRARIES TutorialMinimalBackend::triton-minimal-backend)\n"
  },
  {
    "path": "examples/backends/minimal/src/libtriton_minimal.ldscript",
    "content": "# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n{\n  global:\n    TRITONBACKEND_*;\n  local: *;\n};\n"
  },
  {
    "path": "examples/backends/minimal/src/minimal.cc",
    "content": "// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n//\n// Redistribution and use in source and binary forms, with or without\n// modification, are permitted provided that the following conditions\n// are met:\n//  * Redistributions of source code must retain the above copyright\n//    notice, this list of conditions and the following disclaimer.\n//  * Redistributions in binary form must reproduce the above copyright\n//    notice, this list of conditions and the following disclaimer in the\n//    documentation and/or other materials provided with the distribution.\n//  * Neither the name of NVIDIA CORPORATION nor the names of its\n//    contributors may be used to endorse or promote products derived\n//    from this software without specific prior written permission.\n//\n// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n#include \"triton/backend/backend_common.h\"\n#include \"triton/backend/backend_input_collector.h\"\n#include \"triton/backend/backend_model.h\"\n#include \"triton/backend/backend_model_instance.h\"\n#include \"triton/backend/backend_output_responder.h\"\n#include \"triton/core/tritonbackend.h\"\n\nnamespace triton { namespace backend { namespace minimal {\n\n//\n// Minimal backend that demonstrates the TRITONBACKEND API. This\n// backend works for any model that has 1 input called \"IN0\" with\n// INT32 datatype and shape [ 4 ] and 1 output called \"OUT0\" with\n// INT32 datatype and shape [ 4 ]. The backend supports both batching\n// and non-batching models.\n//\n// For each batch of requests, the backend returns the input tensor\n// value in the output tensor.\n//\n\n/////////////\n\n//\n// ModelState\n//\n// State associated with a model that is using this backend. An object\n// of this class is created and associated with each\n// TRITONBACKEND_Model. ModelState is derived from BackendModel class\n// provided in the backend utilities that provides many common\n// functions.\n//\nclass ModelState : public BackendModel {\n public:\n  static TRITONSERVER_Error* Create(\n      TRITONBACKEND_Model* triton_model, ModelState** state);\n  virtual ~ModelState() = default;\n\n private:\n  ModelState(TRITONBACKEND_Model* triton_model) : BackendModel(triton_model) {}\n};\n\nTRITONSERVER_Error*\nModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state)\n{\n  try {\n    *state = new ModelState(triton_model);\n  }\n  catch (const BackendModelException& ex) {\n    RETURN_ERROR_IF_TRUE(\n        ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL,\n        std::string(\"unexpected nullptr in BackendModelException\"));\n    RETURN_IF_ERROR(ex.err_);\n  }\n\n  return nullptr;  // success\n}\n\nextern \"C\" {\n\n// Triton calls TRITONBACKEND_ModelInitialize when a model is loaded\n// to allow the backend to create any state associated with the model,\n// and to also examine the model configuration to determine if the\n// configuration is suitable for the backend. Any errors reported by\n// this function will prevent the model from loading.\n//\nTRITONSERVER_Error*\nTRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model)\n{\n  // Create a ModelState object and associate it with the\n  // TRITONBACKEND_Model. If anything goes wrong with initialization\n  // of the model state then an error is returned and Triton will fail\n  // to load the model.\n  ModelState* model_state;\n  RETURN_IF_ERROR(ModelState::Create(model, &model_state));\n  RETURN_IF_ERROR(\n      TRITONBACKEND_ModelSetState(model, reinterpret_cast<void*>(model_state)));\n\n  return nullptr;  // success\n}\n\n// Triton calls TRITONBACKEND_ModelFinalize when a model is no longer\n// needed. The backend should cleanup any state associated with the\n// model. This function will not be called until all model instances\n// of the model have been finalized.\n//\nTRITONSERVER_Error*\nTRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model)\n{\n  void* vstate;\n  RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate));\n  ModelState* model_state = reinterpret_cast<ModelState*>(vstate);\n  delete model_state;\n\n  return nullptr;  // success\n}\n\n}  // extern \"C\"\n\n/////////////\n\n//\n// ModelInstanceState\n//\n// State associated with a model instance. An object of this class is\n// created and associated with each\n// TRITONBACKEND_ModelInstance. ModelInstanceState is derived from\n// BackendModelInstance class provided in the backend utilities that\n// provides many common functions.\n//\nclass ModelInstanceState : public BackendModelInstance {\n public:\n  static TRITONSERVER_Error* Create(\n      ModelState* model_state,\n      TRITONBACKEND_ModelInstance* triton_model_instance,\n      ModelInstanceState** state);\n  virtual ~ModelInstanceState() = default;\n\n  // Get the state of the model that corresponds to this instance.\n  ModelState* StateForModel() const { return model_state_; }\n\n private:\n  ModelInstanceState(\n      ModelState* model_state,\n      TRITONBACKEND_ModelInstance* triton_model_instance)\n      : BackendModelInstance(model_state, triton_model_instance),\n        model_state_(model_state)\n  {\n  }\n\n  ModelState* model_state_;\n};\n\nTRITONSERVER_Error*\nModelInstanceState::Create(\n    ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance,\n    ModelInstanceState** state)\n{\n  try {\n    *state = new ModelInstanceState(model_state, triton_model_instance);\n  }\n  catch (const BackendModelInstanceException& ex) {\n    RETURN_ERROR_IF_TRUE(\n        ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL,\n        std::string(\"unexpected nullptr in BackendModelInstanceException\"));\n    RETURN_IF_ERROR(ex.err_);\n  }\n\n  return nullptr;  // success\n}\n\nextern \"C\" {\n\n// Triton calls TRITONBACKEND_ModelInstanceInitialize when a model\n// instance is created to allow the backend to initialize any state\n// associated with the instance.\n//\nTRITONSERVER_Error*\nTRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance)\n{\n  // Get the model state associated with this instance's model.\n  TRITONBACKEND_Model* model;\n  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model));\n\n  void* vmodelstate;\n  RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate));\n  ModelState* model_state = reinterpret_cast<ModelState*>(vmodelstate);\n\n  // Create a ModelInstanceState object and associate it with the\n  // TRITONBACKEND_ModelInstance.\n  ModelInstanceState* instance_state;\n  RETURN_IF_ERROR(\n      ModelInstanceState::Create(model_state, instance, &instance_state));\n  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState(\n      instance, reinterpret_cast<void*>(instance_state)));\n\n  return nullptr;  // success\n}\n\n// Triton calls TRITONBACKEND_ModelInstanceFinalize when a model\n// instance is no longer needed. The backend should cleanup any state\n// associated with the model instance.\n//\nTRITONSERVER_Error*\nTRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance)\n{\n  void* vstate;\n  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate));\n  ModelInstanceState* instance_state =\n      reinterpret_cast<ModelInstanceState*>(vstate);\n  delete instance_state;\n\n  return nullptr;  // success\n}\n\n}  // extern \"C\"\n\n/////////////\n\nextern \"C\" {\n\n// When Triton calls TRITONBACKEND_ModelInstanceExecute it is required\n// that a backend create a response for each request in the batch. A\n// response may be the output tensors required for that request or may\n// be an error that is returned in the response.\n//\nTRITONSERVER_Error*\nTRITONBACKEND_ModelInstanceExecute(\n    TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests,\n    const uint32_t request_count)\n{\n  // Triton will not call this function simultaneously for the same\n  // 'instance'. But since this backend could be used by multiple\n  // instances from multiple models the implementation needs to handle\n  // multiple calls to this function at the same time (with different\n  // 'instance' objects). Best practice for a high-performance\n  // implementation is to avoid introducing mutex/lock and instead use\n  // only function-local and model-instance-specific state.\n  ModelInstanceState* instance_state;\n  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(\n      instance, reinterpret_cast<void**>(&instance_state)));\n  ModelState* model_state = instance_state->StateForModel();\n\n  // 'responses' is initialized as a parallel array to 'requests',\n  // with one TRITONBACKEND_Response object for each\n  // TRITONBACKEND_Request object. If something goes wrong while\n  // creating these response objects, the backend simply returns an\n  // error from TRITONBACKEND_ModelInstanceExecute, indicating to\n  // Triton that this backend did not create or send any responses and\n  // so it is up to Triton to create and send an appropriate error\n  // response for each request. RETURN_IF_ERROR is one of several\n  // useful macros for error handling that can be found in\n  // backend_common.h.\n\n  std::vector<TRITONBACKEND_Response*> responses;\n  responses.reserve(request_count);\n  for (uint32_t r = 0; r < request_count; ++r) {\n    TRITONBACKEND_Request* request = requests[r];\n    TRITONBACKEND_Response* response;\n    RETURN_IF_ERROR(TRITONBACKEND_ResponseNew(&response, request));\n    responses.push_back(response);\n  }\n\n  // At this point, the backend takes ownership of 'requests', which\n  // means that it is responsible for sending a response for every\n  // request. From here, even if something goes wrong in processing,\n  // the backend must return 'nullptr' from this function to indicate\n  // success. Any errors and failures must be communicated via the\n  // response objects.\n  //\n  // To simplify error handling, the backend utilities manage\n  // 'responses' in a specific way and it is recommended that backends\n  // follow this same pattern. When an error is detected in the\n  // processing of a request, an appropriate error response is sent\n  // and the corresponding TRITONBACKEND_Response object within\n  // 'responses' is set to nullptr to indicate that the\n  // request/response has already been handled and no further processing\n  // should be performed for that request. Even if all responses fail,\n  // the backend still allows execution to flow to the end of the\n  // function. RESPOND_AND_SET_NULL_IF_ERROR, and\n  // RESPOND_ALL_AND_SET_NULL_IF_ERROR are macros from\n  // backend_common.h that assist in this management of response\n  // objects.\n\n  // The backend could iterate over the 'requests' and process each\n  // one separately. But for performance reasons it is usually\n  // preferred to create batched input tensors that are processed\n  // simultaneously. This is especially true for devices like GPUs\n  // that are capable of exploiting the large amount parallelism\n  // exposed by larger data sets.\n  //\n  // The backend utilities provide a \"collector\" to facilitate this\n  // batching process. The 'collector's ProcessTensor function will\n  // combine a tensor's value from each request in the batch into a\n  // single contiguous buffer. The buffer can be provided by the\n  // backend or 'collector' can create and manage it. In this backend,\n  // there is not a specific buffer into which the batch should be\n  // created, so use ProcessTensor arguments that cause collector to\n  // manage it.\n\n  BackendInputCollector collector(\n      requests, request_count, &responses, model_state->TritonMemoryManager(),\n      false /* pinned_enabled */, nullptr /* stream*/);\n\n  // To instruct ProcessTensor to \"gather\" the entire batch of IN0\n  // input tensors into a single contiguous buffer in CPU memory, set\n  // the \"allowed input types\" to be the CPU ones (see tritonserver.h\n  // in the triton-inference-server/core repo for allowed memory\n  // types).\n  std::vector<std::pair<TRITONSERVER_MemoryType, int64_t>> allowed_input_types =\n      {{TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}};\n\n  const char* input_buffer;\n  size_t input_buffer_byte_size;\n  TRITONSERVER_MemoryType input_buffer_memory_type;\n  int64_t input_buffer_memory_type_id;\n\n  RESPOND_ALL_AND_SET_NULL_IF_ERROR(\n      responses, request_count,\n      collector.ProcessTensor(\n          \"IN0\", nullptr /* existing_buffer */,\n          0 /* existing_buffer_byte_size */, allowed_input_types, &input_buffer,\n          &input_buffer_byte_size, &input_buffer_memory_type,\n          &input_buffer_memory_type_id));\n\n  // Finalize the collector. If 'true' is returned, 'input_buffer'\n  // will not be valid until the backend synchronizes the CUDA\n  // stream or event that was used when creating the collector. For\n  // this backend, GPU is not supported and so no CUDA sync should\n  // be needed; so if 'true' is returned simply log an error.\n  const bool need_cuda_input_sync = collector.Finalize();\n  if (need_cuda_input_sync) {\n    LOG_MESSAGE(\n        TRITONSERVER_LOG_ERROR,\n        \"'minimal' backend: unexpected CUDA sync required by collector\");\n  }\n\n  // 'input_buffer' contains the batched \"IN0\" tensor. The backend can\n  // implement whatever logic is necessary to produce \"OUT0\". This\n  // backend simply returns the IN0 value in OUT0 so no actual\n  // computation is needed.\n\n  LOG_MESSAGE(\n      TRITONSERVER_LOG_INFO,\n      (std::string(\"model \") + model_state->Name() + \": requests in batch \" +\n       std::to_string(request_count))\n          .c_str());\n  std::string tstr;\n  IGNORE_ERROR(BufferAsTypedString(\n      tstr, input_buffer, input_buffer_byte_size, TRITONSERVER_TYPE_INT32));\n  LOG_MESSAGE(\n      TRITONSERVER_LOG_INFO,\n      (std::string(\"batched IN0 value: \") + tstr).c_str());\n\n  const char* output_buffer = input_buffer;\n  TRITONSERVER_MemoryType output_buffer_memory_type = input_buffer_memory_type;\n  int64_t output_buffer_memory_type_id = input_buffer_memory_type_id;\n\n  // This backend supports models that batch along the first dimension\n  // and those that don't batch. For non-batch models the output shape\n  // will be [ 4 ]. For batch models the output shape will be [ -1, 4\n  // ] and the backend \"responder\" utility below will set the\n  // appropriate batch dimension value for each response.\n  std::vector<int64_t> output_batch_shape;\n  bool supports_first_dim_batching;\n  RESPOND_ALL_AND_SET_NULL_IF_ERROR(\n      responses, request_count,\n      model_state->SupportsFirstDimBatching(&supports_first_dim_batching));\n  if (supports_first_dim_batching) {\n    output_batch_shape.push_back(-1);\n  }\n  output_batch_shape.push_back(4);\n\n  // Because the OUT0 values are concatenated into a single contiguous\n  // 'output_buffer', the backend must \"scatter\" them out to the\n  // individual response OUT0 tensors.  The backend utilities provide\n  // a \"responder\" to facilitate this scattering process.\n\n  // The 'responders's ProcessTensor function will copy the portion of\n  // 'output_buffer' corresponding to each request's output into the\n  // response for that request.\n\n  BackendOutputResponder responder(\n      requests, request_count, &responses, model_state->TritonMemoryManager(),\n      supports_first_dim_batching, false /* pinned_enabled */,\n      nullptr /* stream*/);\n\n  responder.ProcessTensor(\n      \"OUT0\", TRITONSERVER_TYPE_INT32, output_batch_shape, output_buffer,\n      output_buffer_memory_type, output_buffer_memory_type_id);\n\n  // Finalize the responder. If 'true' is returned, the OUT0\n  // tensors' data will not be valid until the backend synchronizes\n  // the CUDA stream or event that was used when creating the\n  // responder. For this backend, GPU is not supported and so no\n  // CUDA sync should be needed; so if 'true' is returned simply log\n  // an error.\n  const bool need_cuda_output_sync = responder.Finalize();\n  if (need_cuda_output_sync) {\n    LOG_MESSAGE(\n        TRITONSERVER_LOG_ERROR,\n        \"'minimal' backend: unexpected CUDA sync required by responder\");\n  }\n\n  // Send all the responses that haven't already been sent because of\n  // an earlier error.\n  for (auto& response : responses) {\n    if (response != nullptr) {\n      LOG_IF_ERROR(\n          TRITONBACKEND_ResponseSend(\n              response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr),\n          \"failed to send response\");\n    }\n  }\n\n  // Done with the request objects so release them.\n  for (uint32_t r = 0; r < request_count; ++r) {\n    auto& request = requests[r];\n    LOG_IF_ERROR(\n        TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL),\n        \"failed releasing request\");\n  }\n\n  return nullptr;  // success\n}\n\n}  // extern \"C\"\n\n}}}  // namespace triton::backend::minimal\n"
  },
  {
    "path": "examples/backends/recommended/CMakeLists.txt",
    "content": "# Copyright 2021-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\ncmake_minimum_required(VERSION 3.31.8)\n\nproject(tutorialrecommendedbackend LANGUAGES C CXX)\n\n#\n# Options\n#\n# Must include options required for this project as well as any\n# projects included in this one by FetchContent.\n#\n# GPU support is disabled by default because recommended backend\n# doesn't use GPUs.\n#\noption(TRITON_ENABLE_GPU \"Enable GPU support in backend\" OFF)\noption(TRITON_ENABLE_STATS \"Include statistics collections in backend\" ON)\n\nset(TRITON_REPO_ORGANIZATION \"https://github.com/triton-inference-server\" CACHE STRING \"Git repository to pull from\")\nset(TRITON_COMMON_REPO_TAG \"main\" CACHE STRING \"Tag for triton-inference-server/common repo\")\nset(TRITON_CORE_REPO_TAG \"main\" CACHE STRING \"Tag for triton-inference-server/core repo\")\nset(TRITON_BACKEND_REPO_TAG \"main\" CACHE STRING \"Tag for triton-inference-server/backend repo\")\n\n#\n# Setting C++ min standard\n#\nset(TRITON_MIN_CXX_STANDARD 17 CACHE STRING \"The minimum C++ standard whose features are requested to build this target.\")\n\nif(NOT CMAKE_BUILD_TYPE)\n  set(CMAKE_BUILD_TYPE Release)\nendif()\n\n#\n# Dependencies\n#\n# FetchContent requires us to include the transitive closure of all\n# repos that we depend on so that we can override the tags.\n#\ninclude(FetchContent)\n\nFetchContent_Declare(\n  repo-common\n  GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/common.git\n  GIT_TAG ${TRITON_COMMON_REPO_TAG}\n  GIT_SHALLOW ON\n)\nFetchContent_Declare(\n  repo-core\n  GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/core.git\n  GIT_TAG ${TRITON_CORE_REPO_TAG}\n  GIT_SHALLOW ON\n)\nFetchContent_Declare(\n  repo-backend\n  GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/backend.git\n  GIT_TAG ${TRITON_BACKEND_REPO_TAG}\n  GIT_SHALLOW ON\n)\nFetchContent_MakeAvailable(repo-common repo-core repo-backend)\n\n#\n# The backend must be built into a shared library. Use an ldscript to\n# hide all symbols except for the TRITONBACKEND API.\n#\nconfigure_file(src/libtriton_recommended.ldscript libtriton_recommended.ldscript COPYONLY)\n\nadd_library(\n  triton-recommended-backend SHARED\n  src/recommended.cc\n)\n\nadd_library(\n  TutorialRecommendedBackend::triton-recommended-backend ALIAS triton-recommended-backend\n)\n\ntarget_include_directories(\n  triton-recommended-backend\n  PRIVATE\n    ${CMAKE_CURRENT_SOURCE_DIR}/src\n)\n\ntarget_compile_features(triton-recommended-backend PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD})\ntarget_compile_options(\n  triton-recommended-backend PRIVATE\n  $<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:\n    -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror>\n  $<$<CXX_COMPILER_ID:MSVC>:/Wall /D_WIN32_WINNT=0x0A00 /EHsc /Zc:preprocessor>\n)\n\ntarget_link_libraries(\n  triton-recommended-backend\n  PRIVATE\n    triton-core-serverapi   # from repo-core\n    triton-core-backendapi  # from repo-core\n    triton-core-serverstub  # from repo-core\n    triton-backend-utils    # from repo-backend\n)\n\nif(WIN32)\n  set_target_properties(\n    triton-recommended-backend PROPERTIES\n    POSITION_INDEPENDENT_CODE ON\n    OUTPUT_NAME triton_recommended\n  )\nelse()\n  set_target_properties(\n    triton-recommended-backend PROPERTIES\n    POSITION_INDEPENDENT_CODE ON\n    OUTPUT_NAME triton_recommended\n    LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_recommended.ldscript\n    LINK_FLAGS \"-Wl,--version-script libtriton_recommended.ldscript\"\n  )\nendif()\n\n#\n# Install\n#\ninclude(GNUInstallDirs)\nset(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TutorialRecommendedBackend)\n\ninstall(\n  TARGETS\n    triton-recommended-backend\n  EXPORT\n    triton-recommended-backend-targets\n  LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/recommended\n  RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/recommended\n)\n\ninstall(\n  EXPORT\n    triton-recommended-backend-targets\n  FILE\n    TutorialRecommendedBackendTargets.cmake\n  NAMESPACE\n    TutorialRecommendedBackend::\n  DESTINATION\n    ${INSTALL_CONFIGDIR}\n)\n\ninclude(CMakePackageConfigHelpers)\nconfigure_package_config_file(\n  ${CMAKE_CURRENT_LIST_DIR}/cmake/TutorialRecommendedBackendConfig.cmake.in\n  ${CMAKE_CURRENT_BINARY_DIR}/TutorialRecommendedBackendConfig.cmake\n  INSTALL_DESTINATION ${INSTALL_CONFIGDIR}\n)\n\ninstall(\n  FILES\n  ${CMAKE_CURRENT_BINARY_DIR}/TutorialRecommendedBackendConfig.cmake\n  DESTINATION ${INSTALL_CONFIGDIR}\n)\n\n#\n# Export from build tree\n#\nexport(\n  EXPORT triton-recommended-backend-targets\n  FILE ${CMAKE_CURRENT_BINARY_DIR}/TutorialRecommendedBackendTargets.cmake\n  NAMESPACE TutorialRecommendedBackend::\n)\n\nexport(PACKAGE TutorialRecommendedBackend)\n"
  },
  {
    "path": "examples/backends/recommended/cmake/TutorialRecommendedBackendConfig.cmake.in",
    "content": "# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\ninclude(CMakeFindDependencyMacro)\n\nget_filename_component(\n  TUTORIALRECOMMENDEDBACKEND_CMAKE_DIR \"${CMAKE_CURRENT_LIST_FILE}\" PATH\n)\n\nlist(APPEND CMAKE_MODULE_PATH ${TUTORIALRECOMMENDEDBACKEND_CMAKE_DIR})\n\nif(NOT TARGET TutorialRecommendedBackend::triton-recommended-backend)\n  include(\"${TUTORIALRECOMMENDEDBACKEND_CMAKE_DIR}/TutorialRecommendedBackendTargets.cmake\")\nendif()\n\nset(TUTORIALRECOMMENDEDBACKEND_LIBRARIES TutorialRecommendedBackend::triton-recommended-backend)\n"
  },
  {
    "path": "examples/backends/recommended/src/libtriton_recommended.ldscript",
    "content": "# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n{\n  global:\n    TRITONBACKEND_*;\n  local: *;\n};\n"
  },
  {
    "path": "examples/backends/recommended/src/recommended.cc",
    "content": "// Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n//\n// Redistribution and use in source and binary forms, with or without\n// modification, are permitted provided that the following conditions\n// are met:\n//  * Redistributions of source code must retain the above copyright\n//    notice, this list of conditions and the following disclaimer.\n//  * Redistributions in binary form must reproduce the above copyright\n//    notice, this list of conditions and the following disclaimer in the\n//    documentation and/or other materials provided with the distribution.\n//  * Neither the name of NVIDIA CORPORATION nor the names of its\n//    contributors may be used to endorse or promote products derived\n//    from this software without specific prior written permission.\n//\n// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n#include \"triton/backend/backend_common.h\"\n#include \"triton/backend/backend_input_collector.h\"\n#include \"triton/backend/backend_model.h\"\n#include \"triton/backend/backend_model_instance.h\"\n#include \"triton/backend/backend_output_responder.h\"\n#include \"triton/core/tritonbackend.h\"\n\nnamespace triton { namespace backend { namespace recommended {\n\n//\n// Backend that demonstrates the TRITONBACKEND API. This backend works\n// for any model that has 1 input with any datatype and any shape and\n// 1 output with the same shape and datatype as the input. The backend\n// supports both batching and non-batching models.\n//\n// For each batch of requests, the backend returns the input tensor\n// value in the output tensor.\n//\n\n/////////////\n\nextern \"C\" {\n\n// Triton calls TRITONBACKEND_Initialize when a backend is loaded into\n// Triton to allow the backend to create and initialize any state that\n// is intended to be shared across all models and model instances that\n// use the backend. The backend should also verify version\n// compatibility with Triton in this function.\n//\nTRITONSERVER_Error*\nTRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend)\n{\n  const char* cname;\n  RETURN_IF_ERROR(TRITONBACKEND_BackendName(backend, &cname));\n  std::string name(cname);\n\n  LOG_MESSAGE(\n      TRITONSERVER_LOG_INFO,\n      (std::string(\"TRITONBACKEND_Initialize: \") + name).c_str());\n\n  // Check the backend API version that Triton supports vs. what this\n  // backend was compiled against. Make sure that the Triton major\n  // version is the same and the minor version is >= what this backend\n  // uses.\n  uint32_t api_version_major, api_version_minor;\n  RETURN_IF_ERROR(\n      TRITONBACKEND_ApiVersion(&api_version_major, &api_version_minor));\n\n  LOG_MESSAGE(\n      TRITONSERVER_LOG_INFO,\n      (std::string(\"Triton TRITONBACKEND API version: \") +\n       std::to_string(api_version_major) + \".\" +\n       std::to_string(api_version_minor))\n          .c_str());\n  LOG_MESSAGE(\n      TRITONSERVER_LOG_INFO,\n      (std::string(\"'\") + name + \"' TRITONBACKEND API version: \" +\n       std::to_string(TRITONBACKEND_API_VERSION_MAJOR) + \".\" +\n       std::to_string(TRITONBACKEND_API_VERSION_MINOR))\n          .c_str());\n\n  if ((api_version_major != TRITONBACKEND_API_VERSION_MAJOR) ||\n      (api_version_minor < TRITONBACKEND_API_VERSION_MINOR)) {\n    return TRITONSERVER_ErrorNew(\n        TRITONSERVER_ERROR_UNSUPPORTED,\n        \"triton backend API version does not support this backend\");\n  }\n\n  // The backend configuration may contain information needed by the\n  // backend, such as tritonserver command-line arguments. This\n  // backend doesn't use any such configuration but for this example\n  // print whatever is available.\n  TRITONSERVER_Message* backend_config_message;\n  RETURN_IF_ERROR(\n      TRITONBACKEND_BackendConfig(backend, &backend_config_message));\n\n  const char* buffer;\n  size_t byte_size;\n  RETURN_IF_ERROR(TRITONSERVER_MessageSerializeToJson(\n      backend_config_message, &buffer, &byte_size));\n  LOG_MESSAGE(\n      TRITONSERVER_LOG_INFO,\n      (std::string(\"backend configuration:\\n\") + buffer).c_str());\n\n  // This backend does not require any \"global\" state but as an\n  // example create a string to demonstrate.\n  std::string* state = new std::string(\"backend state\");\n  RETURN_IF_ERROR(\n      TRITONBACKEND_BackendSetState(backend, reinterpret_cast<void*>(state)));\n\n  return nullptr;  // success\n}\n\n// Triton calls TRITONBACKEND_Finalize when a backend is no longer\n// needed.\n//\nTRITONSERVER_Error*\nTRITONBACKEND_Finalize(TRITONBACKEND_Backend* backend)\n{\n  // Delete the \"global\" state associated with the backend.\n  void* vstate;\n  RETURN_IF_ERROR(TRITONBACKEND_BackendState(backend, &vstate));\n  std::string* state = reinterpret_cast<std::string*>(vstate);\n\n  LOG_MESSAGE(\n      TRITONSERVER_LOG_INFO,\n      (std::string(\"TRITONBACKEND_Finalize: state is '\") + *state + \"'\")\n          .c_str());\n\n  delete state;\n\n  return nullptr;  // success\n}\n\n}  // extern \"C\"\n\n/////////////\n\n//\n// ModelState\n//\n// State associated with a model that is using this backend. An object\n// of this class is created and associated with each\n// TRITONBACKEND_Model. ModelState is derived from BackendModel class\n// provided in the backend utilities that provides many common\n// functions.\n//\nclass ModelState : public BackendModel {\n public:\n  static TRITONSERVER_Error* Create(\n      TRITONBACKEND_Model* triton_model, ModelState** state);\n  virtual ~ModelState() = default;\n\n  // Name of the input and output tensor\n  const std::string& InputTensorName() const { return input_name_; }\n  const std::string& OutputTensorName() const { return output_name_; }\n\n  // Datatype of the input and output tensor\n  TRITONSERVER_DataType TensorDataType() const { return datatype_; }\n\n  // Shape of the input and output tensor as given in the model\n  // configuration file. This shape will not include the batch\n  // dimension (if the model has one).\n  const std::vector<int64_t>& TensorNonBatchShape() const { return nb_shape_; }\n\n  // Shape of the input and output tensor, including the batch\n  // dimension (if the model has one). This method cannot be called\n  // until the model is completely loaded and initialized, including\n  // all instances of the model. 
In practice, this means that backend\n  // should only call it in TRITONBACKEND_ModelInstanceExecute.\n  TRITONSERVER_Error* TensorShape(std::vector<int64_t>& shape);\n\n  // Validate that this model is supported by this backend.\n  TRITONSERVER_Error* ValidateModelConfig();\n\n private:\n  ModelState(TRITONBACKEND_Model* triton_model);\n\n  std::string input_name_;\n  std::string output_name_;\n\n  TRITONSERVER_DataType datatype_;\n\n  bool shape_initialized_;\n  std::vector<int64_t> nb_shape_;\n  std::vector<int64_t> shape_;\n};\n\nModelState::ModelState(TRITONBACKEND_Model* triton_model)\n    : BackendModel(triton_model), shape_initialized_(false)\n{\n  // Validate that the model's configuration matches what is supported\n  // by this backend.\n  THROW_IF_BACKEND_MODEL_ERROR(ValidateModelConfig());\n}\n\nTRITONSERVER_Error*\nModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state)\n{\n  try {\n    *state = new ModelState(triton_model);\n  }\n  catch (const BackendModelException& ex) {\n    RETURN_ERROR_IF_TRUE(\n        ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL,\n        std::string(\"unexpected nullptr in BackendModelException\"));\n    RETURN_IF_ERROR(ex.err_);\n  }\n\n  return nullptr;  // success\n}\n\nTRITONSERVER_Error*\nModelState::TensorShape(std::vector<int64_t>& shape)\n{\n  // This backend supports models that batch along the first dimension\n  // and those that don't batch. For non-batch models the output shape\n  // will be the shape from the model configuration. For batch models\n  // the output shape will be the shape from the model configuration\n  // prepended with [ -1 ] to represent the batch dimension. The\n  // backend \"responder\" utility used below will set the appropriate\n  // batch dimension value for each response. 
The shape needs to be\n  // initialized lazily because the SupportsFirstDimBatching function\n  // cannot be used until the model is completely loaded.\n  if (!shape_initialized_) {\n    bool supports_first_dim_batching;\n    RETURN_IF_ERROR(SupportsFirstDimBatching(&supports_first_dim_batching));\n    if (supports_first_dim_batching) {\n      shape_.push_back(-1);\n    }\n\n    shape_.insert(shape_.end(), nb_shape_.begin(), nb_shape_.end());\n    shape_initialized_ = true;\n  }\n\n  shape = shape_;\n\n  return nullptr;  // success\n}\n\nTRITONSERVER_Error*\nModelState::ValidateModelConfig()\n{\n  // If verbose logging is enabled, dump the model's configuration as\n  // JSON into the console output.\n  if (TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_VERBOSE)) {\n    common::TritonJson::WriteBuffer buffer;\n    RETURN_IF_ERROR(ModelConfig().PrettyWrite(&buffer));\n    LOG_MESSAGE(\n        TRITONSERVER_LOG_VERBOSE,\n        (std::string(\"model configuration:\\n\") + buffer.Contents()).c_str());\n  }\n\n  // ModelConfig is the model configuration as a TritonJson\n  // object. 
Use the TritonJson utilities to parse the JSON and\n  // determine if the configuration is supported by this backend.\n  common::TritonJson::Value inputs, outputs;\n  RETURN_IF_ERROR(ModelConfig().MemberAsArray(\"input\", &inputs));\n  RETURN_IF_ERROR(ModelConfig().MemberAsArray(\"output\", &outputs));\n\n  // The model must have exactly 1 input and 1 output.\n  RETURN_ERROR_IF_FALSE(\n      inputs.ArraySize() == 1, TRITONSERVER_ERROR_INVALID_ARG,\n      std::string(\"model configuration must have 1 input\"));\n  RETURN_ERROR_IF_FALSE(\n      outputs.ArraySize() == 1, TRITONSERVER_ERROR_INVALID_ARG,\n      std::string(\"model configuration must have 1 output\"));\n\n  common::TritonJson::Value input, output;\n  RETURN_IF_ERROR(inputs.IndexAsObject(0, &input));\n  RETURN_IF_ERROR(outputs.IndexAsObject(0, &output));\n\n  // Record the input and output name in the model state.\n  const char* input_name;\n  size_t input_name_len;\n  RETURN_IF_ERROR(input.MemberAsString(\"name\", &input_name, &input_name_len));\n  input_name_ = std::string(input_name);\n\n  const char* output_name;\n  size_t output_name_len;\n  RETURN_IF_ERROR(\n      output.MemberAsString(\"name\", &output_name, &output_name_len));\n  output_name_ = std::string(output_name);\n\n  // Input and output must have same datatype\n  std::string input_dtype, output_dtype;\n  RETURN_IF_ERROR(input.MemberAsString(\"data_type\", &input_dtype));\n  RETURN_IF_ERROR(output.MemberAsString(\"data_type\", &output_dtype));\n  RETURN_ERROR_IF_FALSE(\n      input_dtype == output_dtype, TRITONSERVER_ERROR_INVALID_ARG,\n      std::string(\"expected input and output datatype to match, got \") +\n          input_dtype + \" and \" + output_dtype);\n  datatype_ = ModelConfigDataTypeToTritonServerDataType(input_dtype);\n\n  // Input and output must have same shape. 
Reshape is not supported\n  // on either input or output so flag an error is the model\n  // configuration uses it.\n  triton::common::TritonJson::Value reshape;\n  RETURN_ERROR_IF_TRUE(\n      input.Find(\"reshape\", &reshape), TRITONSERVER_ERROR_UNSUPPORTED,\n      std::string(\"reshape not supported for input tensor\"));\n  RETURN_ERROR_IF_TRUE(\n      output.Find(\"reshape\", &reshape), TRITONSERVER_ERROR_UNSUPPORTED,\n      std::string(\"reshape not supported for output tensor\"));\n\n  std::vector<int64_t> input_shape, output_shape;\n  RETURN_IF_ERROR(backend::ParseShape(input, \"dims\", &input_shape));\n  RETURN_IF_ERROR(backend::ParseShape(output, \"dims\", &output_shape));\n\n  RETURN_ERROR_IF_FALSE(\n      input_shape == output_shape, TRITONSERVER_ERROR_INVALID_ARG,\n      std::string(\"expected input and output shape to match, got \") +\n          backend::ShapeToString(input_shape) + \" and \" +\n          backend::ShapeToString(output_shape));\n\n  nb_shape_ = input_shape;\n\n  return nullptr;  // success\n}\n\nextern \"C\" {\n\n// Triton calls TRITONBACKEND_ModelInitialize when a model is loaded\n// to allow the backend to create any state associated with the model,\n// and to also examine the model configuration to determine if the\n// configuration is suitable for the backend. Any errors reported by\n// this function will prevent the model from loading.\n//\nTRITONSERVER_Error*\nTRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model)\n{\n  // Create a ModelState object and associate it with the\n  // TRITONBACKEND_Model. 
If anything goes wrong with initialization\n  // of the model state then an error is returned and Triton will fail\n  // to load the model.\n  ModelState* model_state;\n  RETURN_IF_ERROR(ModelState::Create(model, &model_state));\n  RETURN_IF_ERROR(\n      TRITONBACKEND_ModelSetState(model, reinterpret_cast<void*>(model_state)));\n\n  return nullptr;  // success\n}\n\n// Triton calls TRITONBACKEND_ModelFinalize when a model is no longer\n// needed. The backend should cleanup any state associated with the\n// model. This function will not be called until all model instances\n// of the model have been finalized.\n//\nTRITONSERVER_Error*\nTRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model)\n{\n  void* vstate;\n  RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate));\n  ModelState* model_state = reinterpret_cast<ModelState*>(vstate);\n  delete model_state;\n\n  return nullptr;  // success\n}\n\n}  // extern \"C\"\n\n/////////////\n\n//\n// ModelInstanceState\n//\n// State associated with a model instance. An object of this class is\n// created and associated with each\n// TRITONBACKEND_ModelInstance. 
ModelInstanceState is derived from\n// BackendModelInstance class provided in the backend utilities that\n// provides many common functions.\n//\nclass ModelInstanceState : public BackendModelInstance {\n public:\n  static TRITONSERVER_Error* Create(\n      ModelState* model_state,\n      TRITONBACKEND_ModelInstance* triton_model_instance,\n      ModelInstanceState** state);\n  virtual ~ModelInstanceState() = default;\n\n  // Get the state of the model that corresponds to this instance.\n  ModelState* StateForModel() const { return model_state_; }\n\n private:\n  ModelInstanceState(\n      ModelState* model_state,\n      TRITONBACKEND_ModelInstance* triton_model_instance)\n      : BackendModelInstance(model_state, triton_model_instance),\n        model_state_(model_state)\n  {\n  }\n\n  ModelState* model_state_;\n};\n\nTRITONSERVER_Error*\nModelInstanceState::Create(\n    ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance,\n    ModelInstanceState** state)\n{\n  try {\n    *state = new ModelInstanceState(model_state, triton_model_instance);\n  }\n  catch (const BackendModelInstanceException& ex) {\n    RETURN_ERROR_IF_TRUE(\n        ex.err_ == nullptr, TRITONSERVER_ERROR_INTERNAL,\n        std::string(\"unexpected nullptr in BackendModelInstanceException\"));\n    RETURN_IF_ERROR(ex.err_);\n  }\n\n  return nullptr;  // success\n}\n\nextern \"C\" {\n\n// Triton calls TRITONBACKEND_ModelInstanceInitialize when a model\n// instance is created to allow the backend to initialize any state\n// associated with the instance.\n//\nTRITONSERVER_Error*\nTRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance)\n{\n  // Get the model state associated with this instance's model.\n  TRITONBACKEND_Model* model;\n  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model));\n\n  void* vmodelstate;\n  RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate));\n  ModelState* model_state = 
reinterpret_cast<ModelState*>(vmodelstate);\n\n  // Create a ModelInstanceState object and associate it with the\n  // TRITONBACKEND_ModelInstance.\n  ModelInstanceState* instance_state;\n  RETURN_IF_ERROR(\n      ModelInstanceState::Create(model_state, instance, &instance_state));\n  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState(\n      instance, reinterpret_cast<void*>(instance_state)));\n\n  return nullptr;  // success\n}\n\n// Triton calls TRITONBACKEND_ModelInstanceFinalize when a model\n// instance is no longer needed. The backend should cleanup any state\n// associated with the model instance.\n//\nTRITONSERVER_Error*\nTRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance)\n{\n  void* vstate;\n  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate));\n  ModelInstanceState* instance_state =\n      reinterpret_cast<ModelInstanceState*>(vstate);\n  delete instance_state;\n\n  return nullptr;  // success\n}\n\n}  // extern \"C\"\n\n/////////////\n\nextern \"C\" {\n\n// When Triton calls TRITONBACKEND_ModelInstanceExecute it is required\n// that a backend create a response for each request in the batch. A\n// response may be the output tensors required for that request or may\n// be an error that is returned in the response.\n//\nTRITONSERVER_Error*\nTRITONBACKEND_ModelInstanceExecute(\n    TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests,\n    const uint32_t request_count)\n{\n  // Collect various timestamps during the execution of this batch or\n  // requests. These values are reported below before returning from\n  // the function.\n\n  uint64_t exec_start_ns = 0;\n  SET_TIMESTAMP(exec_start_ns);\n\n  // Triton will not call this function simultaneously for the same\n  // 'instance'. But since this backend could be used by multiple\n  // instances from multiple models the implementation needs to handle\n  // multiple calls to this function at the same time (with different\n  // 'instance' objects). 
Best practice for a high-performance\n  // implementation is to avoid introducing mutex/lock and instead use\n  // only function-local and model-instance-specific state.\n  ModelInstanceState* instance_state;\n  RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(\n      instance, reinterpret_cast<void**>(&instance_state)));\n  ModelState* model_state = instance_state->StateForModel();\n\n  // 'responses' is initialized as a parallel array to 'requests',\n  // with one TRITONBACKEND_Response object for each\n  // TRITONBACKEND_Request object. If something goes wrong while\n  // creating these response objects, the backend simply returns an\n  // error from TRITONBACKEND_ModelInstanceExecute, indicating to\n  // Triton that this backend did not create or send any responses and\n  // so it is up to Triton to create and send an appropriate error\n  // response for each request. RETURN_IF_ERROR is one of several\n  // useful macros for error handling that can be found in\n  // backend_common.h.\n\n  std::vector<TRITONBACKEND_Response*> responses;\n  responses.reserve(request_count);\n  for (uint32_t r = 0; r < request_count; ++r) {\n    TRITONBACKEND_Request* request = requests[r];\n    TRITONBACKEND_Response* response;\n    RETURN_IF_ERROR(TRITONBACKEND_ResponseNew(&response, request));\n    responses.push_back(response);\n  }\n\n  // At this point, the backend takes ownership of 'requests', which\n  // means that it is responsible for sending a response for every\n  // request. From here, even if something goes wrong in processing,\n  // the backend must return 'nullptr' from this function to indicate\n  // success. Any errors and failures must be communicated via the\n  // response objects.\n  //\n  // To simplify error handling, the backend utilities manage\n  // 'responses' in a specific way and it is recommended that backends\n  // follow this same pattern. 
When an error is detected in the\n  // processing of a request, an appropriate error response is sent\n  // and the corresponding TRITONBACKEND_Response object within\n  // 'responses' is set to nullptr to indicate that the\n  // request/response has already been handled and no further processing\n  // should be performed for that request. Even if all responses fail,\n  // the backend still allows execution to flow to the end of the\n  // function so that statistics are correctly reported by the calls\n  // to TRITONBACKEND_ModelInstanceReportStatistics and\n  // TRITONBACKEND_ModelInstanceReportBatchStatistics.\n  // RESPOND_AND_SET_NULL_IF_ERROR, and\n  // RESPOND_ALL_AND_SET_NULL_IF_ERROR are macros from\n  // backend_common.h that assist in this management of response\n  // objects.\n\n  // The backend could iterate over the 'requests' and process each\n  // one separately. But for performance reasons it is usually\n  // preferred to create batched input tensors that are processed\n  // simultaneously. This is especially true for devices like GPUs\n  // that are capable of exploiting the large amount parallelism\n  // exposed by larger data sets.\n  //\n  // The backend utilities provide a \"collector\" to facilitate this\n  // batching process. The 'collector's ProcessTensor function will\n  // combine a tensor's value from each request in the batch into a\n  // single contiguous buffer. The buffer can be provided by the\n  // backend or 'collector' can create and manage it. In this backend,\n  // there is not a specific buffer into which the batch should be\n  // created, so use ProcessTensor arguments that cause collector to\n  // manage it. 
ProcessTensor does NOT support TRITONSERVER_TYPE_BYTES\n  // data type.\n\n  BackendInputCollector collector(\n      requests, request_count, &responses, model_state->TritonMemoryManager(),\n      false /* pinned_enabled */, nullptr /* stream*/);\n\n  // To instruct ProcessTensor to \"gather\" the entire batch of input\n  // tensors into a single contiguous buffer in CPU memory, set the\n  // \"allowed input types\" to be the CPU ones (see tritonserver.h in\n  // the triton-inference-server/core repo for allowed memory types).\n  std::vector<std::pair<TRITONSERVER_MemoryType, int64_t>> allowed_input_types =\n      {{TRITONSERVER_MEMORY_CPU_PINNED, 0}, {TRITONSERVER_MEMORY_CPU, 0}};\n\n  const char* input_buffer;\n  size_t input_buffer_byte_size;\n  TRITONSERVER_MemoryType input_buffer_memory_type;\n  int64_t input_buffer_memory_type_id;\n\n  RESPOND_ALL_AND_SET_NULL_IF_ERROR(\n      responses, request_count,\n      collector.ProcessTensor(\n          model_state->InputTensorName().c_str(), nullptr /* existing_buffer */,\n          0 /* existing_buffer_byte_size */, allowed_input_types, &input_buffer,\n          &input_buffer_byte_size, &input_buffer_memory_type,\n          &input_buffer_memory_type_id));\n\n  // Finalize the collector. If 'true' is returned, 'input_buffer'\n  // will not be valid until the backend synchronizes the CUDA\n  // stream or event that was used when creating the collector. For\n  // this backend, GPU is not supported and so no CUDA sync should\n  // be needed; so if 'true' is returned simply log an error.\n  const bool need_cuda_input_sync = collector.Finalize();\n  if (need_cuda_input_sync) {\n    LOG_MESSAGE(\n        TRITONSERVER_LOG_ERROR,\n        \"'recommended' backend: unexpected CUDA sync required by collector\");\n  }\n\n  // 'input_buffer' contains the batched input tensor. The backend can\n  // implement whatever logic is necessary to produce the output\n  // tensor. 
This backend simply logs the input tensor value and then\n  // returns the input tensor value in the output tensor so no actual\n  // computation is needed.\n\n  uint64_t compute_start_ns = 0;\n  SET_TIMESTAMP(compute_start_ns);\n\n  LOG_MESSAGE(\n      TRITONSERVER_LOG_INFO,\n      (std::string(\"model \") + model_state->Name() + \": requests in batch \" +\n       std::to_string(request_count))\n          .c_str());\n  std::string tstr;\n  IGNORE_ERROR(BufferAsTypedString(\n      tstr, input_buffer, input_buffer_byte_size,\n      model_state->TensorDataType()));\n  LOG_MESSAGE(\n      TRITONSERVER_LOG_INFO,\n      (std::string(\"batched \" + model_state->InputTensorName() + \" value: \") +\n       tstr)\n          .c_str());\n\n  const char* output_buffer = input_buffer;\n  TRITONSERVER_MemoryType output_buffer_memory_type = input_buffer_memory_type;\n  int64_t output_buffer_memory_type_id = input_buffer_memory_type_id;\n\n  uint64_t compute_end_ns = 0;\n  SET_TIMESTAMP(compute_end_ns);\n\n  bool supports_first_dim_batching;\n  RESPOND_ALL_AND_SET_NULL_IF_ERROR(\n      responses, request_count,\n      model_state->SupportsFirstDimBatching(&supports_first_dim_batching));\n\n  std::vector<int64_t> tensor_shape;\n  RESPOND_ALL_AND_SET_NULL_IF_ERROR(\n      responses, request_count, model_state->TensorShape(tensor_shape));\n\n  // Because the output tensor values are concatenated into a single\n  // contiguous 'output_buffer', the backend must \"scatter\" them out\n  // to the individual response output tensors.  
The backend utilities\n  // provide a \"responder\" to facilitate this scattering process.\n  // BackendOutputResponder does NOT support TRITONSERVER_TYPE_BYTES\n  // data type.\n\n  // The 'responders's ProcessTensor function will copy the portion of\n  // 'output_buffer' corresponding to each request's output into the\n  // response for that request.\n\n  BackendOutputResponder responder(\n      requests, request_count, &responses, model_state->TritonMemoryManager(),\n      supports_first_dim_batching, false /* pinned_enabled */,\n      nullptr /* stream*/);\n\n  responder.ProcessTensor(\n      model_state->OutputTensorName().c_str(), model_state->TensorDataType(),\n      tensor_shape, output_buffer, output_buffer_memory_type,\n      output_buffer_memory_type_id);\n\n  // Finalize the responder. If 'true' is returned, the output\n  // tensors' data will not be valid until the backend synchronizes\n  // the CUDA stream or event that was used when creating the\n  // responder. For this backend, GPU is not supported and so no CUDA\n  // sync should be needed; so if 'true' is returned simply log an\n  // error.\n  const bool need_cuda_output_sync = responder.Finalize();\n  if (need_cuda_output_sync) {\n    LOG_MESSAGE(\n        TRITONSERVER_LOG_ERROR,\n        \"'recommended' backend: unexpected CUDA sync required by responder\");\n  }\n\n  // Send all the responses that haven't already been sent because of\n  // an earlier error.\n  for (auto& response : responses) {\n    if (response != nullptr) {\n      LOG_IF_ERROR(\n          TRITONBACKEND_ResponseSend(\n              response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr),\n          \"failed to send response\");\n    }\n  }\n\n  uint64_t exec_end_ns = 0;\n  SET_TIMESTAMP(exec_end_ns);\n\n#ifdef TRITON_ENABLE_STATS\n  // For batch statistics need to know the total batch size of the\n  // requests. 
This is not necessarily just the number of requests,\n  // because if the model supports batching then any request can be a\n  // batched request itself.\n  size_t total_batch_size = 0;\n  if (!supports_first_dim_batching) {\n    total_batch_size = request_count;\n  } else {\n    for (uint32_t r = 0; r < request_count; ++r) {\n      auto& request = requests[r];\n      TRITONBACKEND_Input* input = nullptr;\n      LOG_IF_ERROR(\n          TRITONBACKEND_RequestInputByIndex(request, 0 /* index */, &input),\n          \"failed getting request input\");\n      if (input != nullptr) {\n        const int64_t* shape = nullptr;\n        LOG_IF_ERROR(\n            TRITONBACKEND_InputProperties(\n                input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr),\n            \"failed getting input properties\");\n        if (shape != nullptr) {\n          total_batch_size += shape[0];\n        }\n      }\n    }\n  }\n#else\n  (void)exec_start_ns;\n  (void)exec_end_ns;\n  (void)compute_start_ns;\n  (void)compute_end_ns;\n#endif  // TRITON_ENABLE_STATS\n\n  // Report statistics for each request, and then release the request.\n  for (uint32_t r = 0; r < request_count; ++r) {\n    auto& request = requests[r];\n\n#ifdef TRITON_ENABLE_STATS\n    LOG_IF_ERROR(\n        TRITONBACKEND_ModelInstanceReportStatistics(\n            instance_state->TritonModelInstance(), request,\n            (responses[r] != nullptr) /* success */, exec_start_ns,\n            compute_start_ns, compute_end_ns, exec_end_ns),\n        \"failed reporting request statistics\");\n#endif  // TRITON_ENABLE_STATS\n\n    LOG_IF_ERROR(\n        TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL),\n        \"failed releasing request\");\n  }\n\n#ifdef TRITON_ENABLE_STATS\n  // Report batch statistics.\n  LOG_IF_ERROR(\n      TRITONBACKEND_ModelInstanceReportBatchStatistics(\n          instance_state->TritonModelInstance(), total_batch_size,\n          exec_start_ns, compute_start_ns, 
compute_end_ns, exec_end_ns),\n      \"failed reporting batch request statistics\");\n#endif  // TRITON_ENABLE_STATS\n\n  return nullptr;  // success\n}\n\n}  // extern \"C\"\n\n}}}  // namespace triton::backend::recommended\n"
  },
  {
    "path": "examples/batching_strategies/single_batching/CMakeLists.txt",
    "content": "# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\ncmake_minimum_required(VERSION 3.31.8)\n\nproject(singlebatching LANGUAGES C CXX)\n\nset(TRITON_REPO_ORGANIZATION \"https://github.com/triton-inference-server\" CACHE STRING \"Git repository to pull from\")\nset(TRITON_COMMON_REPO_TAG \"main\" CACHE STRING \"Tag for triton-inference-server/common repo\")\nset(TRITON_CORE_REPO_TAG \"main\" CACHE STRING \"Tag for triton-inference-server/core repo\")\nset(TRITON_BACKEND_REPO_TAG \"main\" CACHE STRING \"Tag for triton-inference-server/backend repo\")\n\n#\n# Setting C++ min standard\n#\nset(TRITON_MIN_CXX_STANDARD 17 CACHE STRING \"The minimum C++ standard whose features are requested to build this target.\")\n\nif(NOT CMAKE_BUILD_TYPE)\n  set(CMAKE_BUILD_TYPE Release)\nendif()\n\n#\n# Dependencies\n#\n# FetchContent requires us to include the transitive closure of all\n# repos that we depend on so that we can override the tags.\n#\ninclude(FetchContent)\n\nFetchContent_Declare(\n  repo-common\n  GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/common.git\n  GIT_TAG ${TRITON_COMMON_REPO_TAG}\n  GIT_SHALLOW ON\n)\nFetchContent_Declare(\n  repo-core\n  GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/core.git\n  GIT_TAG ${TRITON_CORE_REPO_TAG}\n  GIT_SHALLOW ON\n)\nFetchContent_Declare(\n  repo-backend\n  GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/backend.git\n  GIT_TAG ${TRITON_BACKEND_REPO_TAG}\n  GIT_SHALLOW ON\n)\nFetchContent_MakeAvailable(repo-common repo-core repo-backend)\n\n#\n# The backend must 
be built into a shared library. Use an ldscript to\n# hide all symbols except for the TRITONBACKEND_ModelBatch API.\n#\nconfigure_file(src/libtriton_singlebatching.ldscript libtriton_singlebatching.ldscript COPYONLY)\n\nadd_library(\n  triton-single-batching SHARED\n  src/single_batching.cc\n)\n\ntarget_include_directories(\n  triton-single-batching\n  PRIVATE\n    ${CMAKE_CURRENT_SOURCE_DIR}/src\n)\n\ntarget_compile_features(triton-single-batching PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD})\ntarget_compile_options(\n  triton-single-batching PRIVATE\n  $<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:\n    -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror>\n  $<$<CXX_COMPILER_ID:MSVC>:/Wall /D_WIN32_WINNT=0x0A00 /EHsc /Zc:preprocessor>\n)\n\ntarget_link_libraries(\n  triton-single-batching\n  PRIVATE\n    triton-core-serverapi   # from repo-core\n    triton-core-backendapi  # from repo-core\n    triton-core-serverstub  # from repo-core\n    triton-backend-utils    # from repo-backend\n)\n\nif(WIN32)\n  set_target_properties(\n    triton-single-batching PROPERTIES\n    POSITION_INDEPENDENT_CODE ON\n    OUTPUT_NAME triton_singlebatching\n  )\nelse()\n  set_target_properties(\n    triton-single-batching PROPERTIES\n    POSITION_INDEPENDENT_CODE ON\n    OUTPUT_NAME triton_singlebatching\n    LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_singlebatching.ldscript\n    LINK_FLAGS \"-Wl,--version-script libtriton_singlebatching.ldscript\"\n  )\nendif()\n\n#\n# Install\n#\ninclude(GNUInstallDirs)\nset(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/SingleBatching)\n\ninstall(\n  TARGETS\n    triton-single-batching\n  EXPORT\n    triton-single-batching-targets\n  LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/batching/single_batching\n  RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/batching/single_batching\n)\n\ninstall(\n  EXPORT\n    triton-single-batching-targets\n  FILE\n    SingleBatchingTargets.cmake\n  NAMESPACE\n    
triton-single-batching\n  DESTINATION\n    ${INSTALL_CONFIGDIR}\n)\n\ninclude(CMakePackageConfigHelpers)\nconfigure_package_config_file(\n  ${CMAKE_CURRENT_LIST_DIR}/cmake/triton-single-batching.cmake.in\n  ${CMAKE_CURRENT_BINARY_DIR}/triton-single-batching.cmake\n  INSTALL_DESTINATION ${INSTALL_CONFIGDIR}\n)\n\ninstall(\n  FILES\n  ${CMAKE_CURRENT_BINARY_DIR}/triton-single-batching.cmake\n  DESTINATION ${INSTALL_CONFIGDIR}\n)\n\n#\n# Export from build tree\n#\nexport(\n  EXPORT triton-single-batching-targets\n  FILE ${CMAKE_CURRENT_BINARY_DIR}/triton-single-batching.cmake\n  NAMESPACE triton-single-batching\n)\n\nexport(PACKAGE triton-single-batching)\n"
  },
  {
    "path": "examples/batching_strategies/single_batching/cmake/triton-single-batching.cmake.in",
    "content": "# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\ninclude(CMakeFindDependencyMacro)\n\nget_filename_component(\n  SINGLEBATCHING_CMAKE_DIR \"${CMAKE_CURRENT_LIST_FILE}\" PATH\n)\n\nlist(APPEND CMAKE_MODULE_PATH ${SINGLEBATCHING_CMAKE_DIR})\n\nif(NOT TARGET triton-single-batching)\n  include(\"${SINGLEBATCHING_CMAKE_DIR}/SingleBatchingTargets.cmake\")\nendif()\n\nset(SINGLEBATCHING_LIBRARIES triton-single-batching)\n"
  },
  {
    "path": "examples/batching_strategies/single_batching/src/libtriton_singlebatching.ldscript",
    "content": "# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n{\n  global:\n    TRITONBACKEND_ModelBatch*;\n  local: *;\n};\n"
  },
  {
    "path": "examples/batching_strategies/single_batching/src/single_batching.cc",
    "content": "// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n//\n// Redistribution and use in source and binary forms, with or without\n// modification, are permitted provided that the following conditions\n// are met:\n//  * Redistributions of source code must retain the above copyright\n//    notice, this list of conditions and the following disclaimer.\n//  * Redistributions in binary form must reproduce the above copyright\n//    notice, this list of conditions and the following disclaimer in the\n//    documentation and/or other materials provided with the distribution.\n//  * Neither the name of NVIDIA CORPORATION nor the names of its\n//    contributors may be used to endorse or promote products derived\n//    from this software without specific prior written permission.\n//\n// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n#include <iostream>\n\n#include \"triton/core/tritonbackend.h\"\n\nnamespace triton { namespace core { namespace single_batching {\n\n//\n// Minimal custom  batching strategy that demonstrates the\n// TRITONBACKEND_ModelBatch API. This custom batching strategy dynamically\n// creates batches up to 1 request.\n//\n\n/////////////\n\nextern \"C\" {\n\n/// Check whether a request should be added to the pending model batch.\n///\n/// \\param request The request to be added to the pending batch.\n/// \\param userp The placeholder for backend to store and retrieve information\n/// about this pending batch. When the callback returns, this should reflect\n/// the latest batch information.\n/// \\param should_include The pointer to be updated on whether the request\n/// should be included in the batch.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error*\nTRITONBACKEND_ModelBatchIncludeRequest(\n    TRITONBACKEND_Request* request, void* userp, bool* should_include)\n{\n  // Check if the batch is empty.\n  // If so, include this request. Otherwise, do not.\n  bool* empty = static_cast<bool*>(userp);\n  if (*empty) {\n    *should_include = true;\n    *empty = false;\n  } else {\n    *should_include = false;\n  }\n\n  return nullptr;  // success\n}\n\n/// Callback to be invoked when Triton has begun forming a batch.\n///\n/// \\param batcher The read-only placeholder for backend to retrieve\n// information about the batching strategy for this model.\n/// \\param userp The placeholder for backend to store and retrieve information\n/// about this pending batch.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error*\nTRITONBACKEND_ModelBatchInitialize(\n    const TRITONBACKEND_Batcher* batcher, void** userp)\n{\n  // Userp will point to a boolean indicating whether the batch is empty.\n  *userp = new bool(true);\n  return nullptr;  // success\n}\n\n/// Callback to be invoked when Triton has finishing forming a batch.\n///\n/// \\param userp The placeholder for backend to store and retrieve information\n/// about this pending batch.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error*\nTRITONBACKEND_ModelBatchFinalize(void* userp)\n{\n  delete static_cast<bool*>(userp);\n  return nullptr;  // success\n}\n\n/// Create a new batcher for use with custom batching. This is called during\n/// model loading. The batcher will point to a user-defined data structure that\n/// holds read-only data used for custom batching.\n///\n/// \\param batcher User-defined placeholder for backend to store and\n/// retrieve information about the batching strategy for this model.\n/// return a TRITONSERVER_Error indicating success or failure.\n/// \\param model The backend model for which Triton is forming a batch.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error*\nTRITONBACKEND_ModelBatcherInitialize(\n    TRITONBACKEND_Batcher** batcher, TRITONBACKEND_Model* model)\n{\n  return nullptr;  // success\n}\n\n/// Free memory associated with batcher. This is called during model unloading.\n///\n/// \\param batcher User-defined placeholder for backend to store and\n/// retrieve information about the batching strategy for this model.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error*\nTRITONBACKEND_ModelBatcherFinalize(TRITONBACKEND_Batcher* batcher)\n{\n  return nullptr;  // success\n}\n\n}  // extern \"C\"\n\n}}}  // namespace triton::core::single_batching\n"
  },
  {
    "path": "examples/batching_strategies/volume_batching/CMakeLists.txt",
    "content": "# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\ncmake_minimum_required(VERSION 3.31.8)\n\nproject(volumebatching LANGUAGES C CXX)\n\nset(TRITON_REPO_ORGANIZATION \"https://github.com/triton-inference-server\" CACHE STRING \"Git repository to pull from\")\nset(TRITON_COMMON_REPO_TAG \"main\" CACHE STRING \"Tag for triton-inference-server/common repo\")\nset(TRITON_CORE_REPO_TAG \"main\" CACHE STRING \"Tag for triton-inference-server/core repo\")\nset(TRITON_BACKEND_REPO_TAG \"main\" CACHE STRING \"Tag for triton-inference-server/backend repo\")\n\n#\n# Setting C++ min standard\n#\nset(TRITON_MIN_CXX_STANDARD 17 CACHE STRING \"The minimum C++ standard whose features are requested to build this target.\")\n\nif(NOT CMAKE_BUILD_TYPE)\n  set(CMAKE_BUILD_TYPE Release)\nendif()\n\n#\n# Dependencies\n#\n# FetchContent requires us to include the transitive closure of all\n# repos that we depend on so that we can override the tags.\n#\ninclude(FetchContent)\n\nFetchContent_Declare(\n  repo-common\n  GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/common.git\n  GIT_TAG ${TRITON_COMMON_REPO_TAG}\n  GIT_SHALLOW ON\n)\nFetchContent_Declare(\n  repo-core\n  GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/core.git\n  GIT_TAG ${TRITON_CORE_REPO_TAG}\n  GIT_SHALLOW ON\n)\nFetchContent_Declare(\n  repo-backend\n  GIT_REPOSITORY ${TRITON_REPO_ORGANIZATION}/backend.git\n  GIT_TAG ${TRITON_BACKEND_REPO_TAG}\n  GIT_SHALLOW ON\n)\nFetchContent_MakeAvailable(repo-common repo-core repo-backend)\n\n#\n# The backend must be built into a shared library. Use an ldscript to\n# hide all symbols except for the TRITONBACKEND_ModelBatch API.\n#\nconfigure_file(src/libtriton_volumebatching.ldscript libtriton_volumebatching.ldscript COPYONLY)\n\nadd_library(\n  triton-volume-batching SHARED\n  src/volume_batching.cc\n)\n\ntarget_include_directories(\n  triton-volume-batching\n  PRIVATE\n    ${CMAKE_CURRENT_SOURCE_DIR}/src\n)\n\ntarget_compile_features(triton-volume-batching PRIVATE cxx_std_${TRITON_MIN_CXX_STANDARD})\ntarget_compile_options(\n  triton-volume-batching PRIVATE\n  $<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:\n    -Wall -Wextra -Wno-unused-parameter -Wno-type-limits -Werror>\n  $<$<CXX_COMPILER_ID:MSVC>:/Wall /D_WIN32_WINNT=0x0A00 /EHsc /Zc:preprocessor>\n)\n\ntarget_link_libraries(\n  triton-volume-batching\n  PRIVATE\n    triton-core-serverapi   # from repo-core\n    triton-core-backendapi  # from repo-core\n    triton-core-serverstub  # from repo-core\n    triton-backend-utils    # from repo-backend\n)\n\nif(WIN32)\n  set_target_properties(\n    triton-volume-batching PROPERTIES\n    POSITION_INDEPENDENT_CODE ON\n    OUTPUT_NAME triton_volumebatching\n  )\nelse()\n  set_target_properties(\n    triton-volume-batching PROPERTIES\n    POSITION_INDEPENDENT_CODE ON\n    OUTPUT_NAME triton_volumebatching\n    LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_volumebatching.ldscript\n    LINK_FLAGS \"-Wl,--version-script libtriton_volumebatching.ldscript\"\n  )\nendif()\n\n#\n# Install\n#\ninclude(GNUInstallDirs)\nset(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/volumeBatching)\n\ninstall(\n  TARGETS\n    triton-volume-batching\n  EXPORT\n    triton-volume-batching-targets\n  LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/batching/volume_batching\n  RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/batching/volume_batching\n)\n\ninstall(\n  EXPORT\n    triton-volume-batching-targets\n  FILE\n    VolumeBatchingTargets.cmake\n  NAMESPACE\n    triton-volume-batching\n  DESTINATION\n    ${INSTALL_CONFIGDIR}\n)\n\ninclude(CMakePackageConfigHelpers)\nconfigure_package_config_file(\n  ${CMAKE_CURRENT_LIST_DIR}/cmake/triton-volume-batching.cmake.in\n  ${CMAKE_CURRENT_BINARY_DIR}/triton-volume-batching.cmake\n  INSTALL_DESTINATION ${INSTALL_CONFIGDIR}\n)\n\ninstall(\n  FILES\n  ${CMAKE_CURRENT_BINARY_DIR}/triton-volume-batching.cmake\n  DESTINATION ${INSTALL_CONFIGDIR}\n)\n\n#\n# Export from build tree\n#\nexport(\n  EXPORT triton-volume-batching-targets\n  FILE ${CMAKE_CURRENT_BINARY_DIR}/triton-volume-batching.cmake\n  NAMESPACE triton-volume-batching\n)\n\nexport(PACKAGE triton-volume-batching)\n"
  },
  {
    "path": "examples/batching_strategies/volume_batching/cmake/triton-volume-batching.cmake.in",
    "content": "# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\ninclude(CMakeFindDependencyMacro)\n\nget_filename_component(\n  VOLUMEBATCHING_CMAKE_DIR \"${CMAKE_CURRENT_LIST_FILE}\" PATH\n)\n\nlist(APPEND CMAKE_MODULE_PATH ${VOLUMEBATCHING_CMAKE_DIR})\n\nif(NOT TARGET triton-volume-batching)\n  include(\"${VOLUMEBATCHING_CMAKE_DIR}/VolumeBatchingTargets.cmake\")\nendif()\n\nset(VOLUMEBATCHING_LIBRARIES triton-volume-batching)\n"
  },
  {
    "path": "examples/batching_strategies/volume_batching/src/libtriton_volumebatching.ldscript",
    "content": "# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n{\n  global:\n    TRITONBACKEND_ModelBatch*;\n  local: *;\n};\n"
  },
  {
    "path": "examples/batching_strategies/volume_batching/src/volume_batching.cc",
    "content": "// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n//\n// Redistribution and use in source and binary forms, with or without\n// modification, are permitted provided that the following conditions\n// are met:\n//  * Redistributions of source code must retain the above copyright\n//    notice, this list of conditions and the following disclaimer.\n//  * Redistributions in binary form must reproduce the above copyright\n//    notice, this list of conditions and the following disclaimer in the\n//    documentation and/or other materials provided with the distribution.\n//  * Neither the name of NVIDIA CORPORATION nor the names of its\n//    contributors may be used to endorse or promote products derived\n//    from this software without specific prior written permission.\n//\n// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n#include <iostream>\n\n#include \"triton/core/tritonbackend.h\"\n\n#define TRITONJSON_STATUSTYPE TRITONSERVER_Error*\n#define TRITONJSON_STATUSRETURN(M) \\\n  return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, (M).c_str())\n#define TRITONJSON_STATUSSUCCESS nullptr\n#include \"triton/common/triton_json.h\"\n\nnamespace triton { namespace core { namespace volume_batching {\n\n//\n// Minimal custom  batching strategy that demonstrates the\n// TRITONBACKEND_ModelBatch API. This custom batching strategy dynamically\n// creates batches up to 1 request.\n//\n\n/////////////\n\nextern \"C\" {\n\n/// Check whether a request should be added to the pending model batch.\n///\n/// \\param request The request to be added to the pending batch.\n/// \\param userp The placeholder for backend to store and retrieve information\n/// about this pending batch. When the callback returns, this should reflect\n/// the latest batch information.\n/// \\param should_include The pointer to be updated on whether the request\n/// should be included in the batch.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error*\nTRITONBACKEND_ModelBatchIncludeRequest(\n    TRITONBACKEND_Request* request, void* userp, bool* should_include)\n{\n  // Default should_include to false in case function returns error.\n  *should_include = false;\n\n  // Get current remaining batch volume.\n  unsigned int* remaining_volume = static_cast<unsigned int*>(userp);\n\n  // Get request's volume in bytes.\n  unsigned int pending_volume = 0;\n\n  uint32_t input_count;\n  auto err = TRITONBACKEND_RequestInputCount(request, &input_count);\n  if (err)\n    return err;\n\n  TRITONBACKEND_Input* input;\n  size_t data_byte_size;\n\n  for (size_t count = 0; count < input_count; count++) {\n    auto err =\n        TRITONBACKEND_RequestInputByIndex(request, count /* index */, &input);\n    if (err)\n      return err;\n    err = TRITONBACKEND_InputProperties(\n        input, nullptr, nullptr, nullptr, nullptr, &data_byte_size, nullptr);\n    if (err)\n      return err;\n    pending_volume += static_cast<unsigned int>(data_byte_size);\n  }\n\n  // Print remaining volume for debugging purposes.\n  std::cout << \"Pending volume : \" << pending_volume << std::endl;\n  std::cout << \"Remaining volume : \" << *remaining_volume << std::endl;\n\n  // Check if there is enough remaining volume for this request.\n  // If so, include this request. Otherwise, do not.\n  if (pending_volume <= *remaining_volume) {\n    *should_include = true;\n    *remaining_volume = *remaining_volume - pending_volume;\n  } else {\n    *should_include = false;\n  }\n\n  return nullptr;  // success\n}\n\n/// Callback to be invoked when Triton has begun forming a batch.\n///\n/// \\param batcher The read-only placeholder for backend to retrieve\n// information about the batching strategy for this model.\n/// \\param userp The placeholder for backend to store and retrieve information\n/// about this pending batch.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error*\nTRITONBACKEND_ModelBatchInitialize(\n    const TRITONBACKEND_Batcher* batcher, void** userp)\n{\n  // Userp will point to an unsigned integer representing the remaining volume\n  // in bytes for this batch.\n  *userp = new unsigned int(*reinterpret_cast<const unsigned int*>(batcher));\n  return nullptr;  // success\n}\n\n/// Callback to be invoked when Triton has finishing forming a batch.\n///\n/// \\param userp The placeholder for backend to store and retrieve information\n/// about this pending batch.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error*\nTRITONBACKEND_ModelBatchFinalize(void* userp)\n{\n  delete static_cast<unsigned int*>(userp);\n  return nullptr;  // success\n}\n\n/// Create a new batcher for use with custom batching. This is called during\n/// model loading. The batcher will point to a user-defined data structure that\n/// holds read-only data used for custom batching.\n///\n/// \\param batcher User-defined placeholder for backend to store and\n/// retrieve information about the batching strategy for this model.\n/// return a TRITONSERVER_Error indicating success or failure.\n/// \\param model The backend model for which Triton is forming a batch.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error*\nTRITONBACKEND_ModelBatcherInitialize(\n    TRITONBACKEND_Batcher** batcher, TRITONBACKEND_Model* model)\n{\n  // Batcher will point to an unsigned integer representing the maximum\n  // volume in bytes for each batch.\n\n  // Read the user-specified bytes from the model config.\n  TRITONSERVER_Message* config_message;\n  TRITONBACKEND_ModelConfig(model, 1 /* config_version */, &config_message);\n\n  const char* buffer;\n  size_t byte_size;\n\n  uint64_t max_volume_bytes = 0;\n  std::string max_volume_bytes_str;\n\n  auto err =\n      TRITONSERVER_MessageSerializeToJson(config_message, &buffer, &byte_size);\n  if (err)\n    return err;\n\n  triton::common::TritonJson::Value model_config, params, volume_param;\n  err = model_config.Parse(buffer, byte_size);\n  TRITONSERVER_MessageDelete(config_message);\n\n  if (!model_config.Find(\"parameters\", &params)) {\n    return TRITONSERVER_ErrorNew(\n        TRITONSERVER_ERROR_NOT_FOUND,\n        \"Unable to find parameters in model config\");\n  }\n\n  std::vector<std::string> param_keys;\n\n  if (!params.Find(\"MAX_BATCH_VOLUME_BYTES\", &volume_param)) {\n    return TRITONSERVER_ErrorNew(\n        TRITONSERVER_ERROR_NOT_FOUND,\n        \"Unable to find MAX_BATCH_VOLUME_BYTES parameter in model config\");\n  }\n  err = volume_param.MemberAsString(\"string_value\", &max_volume_bytes_str);\n  if (err)\n    return err;\n\n  try {\n    max_volume_bytes = static_cast<uint64_t>(std::stoul(max_volume_bytes_str));\n  }\n  catch (const std::invalid_argument& ia) {\n    return TRITONSERVER_ErrorNew(\n        TRITONSERVER_ERROR_INVALID_ARG,\n        (std::string(\"failed to convert '\") + max_volume_bytes_str +\n         \"' to unsigned int64\")\n            .c_str());\n  }\n\n  *batcher = reinterpret_cast<TRITONBACKEND_Batcher*>(\n      new unsigned int(max_volume_bytes));\n  return nullptr;  // success\n}\n\n/// Free memory associated with batcher. This is called during model unloading.\n///\n/// \\param batcher User-defined placeholder for backend to store and\n/// retrieve information about the batching strategy for this model.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error*\nTRITONBACKEND_ModelBatcherFinalize(TRITONBACKEND_Batcher* batcher)\n{\n  delete reinterpret_cast<unsigned int*>(batcher);\n  return nullptr;  // success\n}\n\n}  // extern \"C\"\n\n}}}  // namespace triton::core::volume_batching\n"
  },
  {
    "path": "examples/clients/bls_client",
    "content": "#!/usr/bin/python\n# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\nimport argparse\nimport sys\n\nimport numpy as np\nimport tritonhttpclient as httpclient\nfrom tritonclientutils import np_to_triton_dtype\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\n        \"-u\",\n        \"--url\",\n        type=str,\n        required=False,\n        default=\"localhost:8000\",\n        help=\"Inference server URL. Default is localhost:8000.\",\n    )\n    FLAGS = parser.parse_args()\n\n    model_name = \"bls_fp32\"\n    shape = [16]\n    with httpclient.InferenceServerClient(url=FLAGS.url) as client:\n        input0_data = np.random.rand(*shape).astype(np.float32)\n        input1_data = np.random.rand(*shape).astype(np.float32)\n        inputs = [\n            httpclient.InferInput(\n                \"INPUT0\",\n                input0_data.shape,\n                np_to_triton_dtype(input0_data.dtype),\n            ),\n            httpclient.InferInput(\n                \"INPUT1\",\n                input1_data.shape,\n                np_to_triton_dtype(input1_data.dtype),\n            ),\n        ]\n\n        inputs[0].set_data_from_numpy(input0_data)\n        inputs[1].set_data_from_numpy(input1_data)\n\n        outputs = [\n            httpclient.InferRequestedOutput(\"OUTPUT0\"),\n            httpclient.InferRequestedOutput(\"OUTPUT1\"),\n        ]\n        response = client.infer(model_name, inputs, request_id=str(1), outputs=outputs)\n\n        result = response.get_response()\n        output0_data = response.as_numpy(\"OUTPUT0\")\n        output1_data = response.as_numpy(\"OUTPUT1\")\n\n        print(\n            \"INPUT0 ({}) + INPUT1 ({}) = OUTPUT0 ({})\".format(\n                input0_data, input1_data, output0_data\n            )\n        )\n        print(\n            \"INPUT0 ({}) - INPUT1 ({}) = OUTPUT1 ({})\".format(\n                input0_data, input1_data, output1_data\n            )\n        )\n\n        if not np.allclose(input0_data + input1_data, output0_data):\n            print(\"error: incorrect sum\")\n            sys.exit(1)\n\n        if not np.allclose(input0_data - input1_data, output1_data):\n            print(\"error: incorrect difference\")\n            sys.exit(1)\n\n    print(\"\\nPASS\")\n    sys.exit(0)\n"
  },
  {
    "path": "examples/clients/minimal_client",
    "content": "#!/usr/bin/env python\n# Copyright 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\nimport argparse\nimport sys\n\nimport numpy as np\nimport tritonclient.http as httpclient\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\n        \"-u\",\n        \"--url\",\n        type=str,\n        required=False,\n        default=\"localhost:8000\",\n        help=\"Inference server URL. Default is localhost:8000.\",\n    )\n    FLAGS = parser.parse_args()\n\n    # For the HTTP client, need to specify large enough concurrency to\n    # issue all the inference requests to the server in parallel. For\n    # this example we want to be able to send 2 requests concurrently.\n    try:\n        concurrent_request_count = 2\n        triton_client = httpclient.InferenceServerClient(\n            url=FLAGS.url, concurrency=concurrent_request_count\n        )\n    except Exception as e:\n        print(\"channel creation failed: \" + str(e))\n        sys.exit(1)\n\n    # First send a single request to the nonbatching model.\n    print(\"=========\")\n    input0_data = np.array([1, 2, 3, 4], dtype=np.int32)\n    print(\"Sending request to nonbatching model: IN0 = {}\".format(input0_data))\n\n    inputs = [httpclient.InferInput(\"IN0\", [4], \"INT32\")]\n    inputs[0].set_data_from_numpy(input0_data)\n    result = triton_client.infer(\"nonbatching\", inputs)\n\n    print(\"Response: {}\".format(result.get_response()))\n    print(\"OUT0 = {}\".format(result.as_numpy(\"OUT0\")))\n\n    # Send 2 requests to the batching model. Because these are sent\n    # asynchronously and Triton's dynamic batcher is configured to\n    # delay up to 5 seconds when forming a batch for this model, we\n    # expect these 2 requests to be batched within Triton and sent to\n    # the minimal backend as a single batch.\n    print(\"\\n=========\")\n    async_requests = []\n\n    input0_data = np.array([[10, 11, 12, 13]], dtype=np.int32)\n    print(\"Sending request to batching model: IN0 = {}\".format(input0_data))\n    inputs = [httpclient.InferInput(\"IN0\", [1, 4], \"INT32\")]\n    inputs[0].set_data_from_numpy(input0_data)\n    async_requests.append(triton_client.async_infer(\"batching\", inputs))\n\n    input0_data = np.array([[20, 21, 22, 23]], dtype=np.int32)\n    print(\"Sending request to batching model: IN0 = {}\".format(input0_data))\n    inputs = [httpclient.InferInput(\"IN0\", [1, 4], \"INT32\")]\n    inputs[0].set_data_from_numpy(input0_data)\n    async_requests.append(triton_client.async_infer(\"batching\", inputs))\n\n    for async_request in async_requests:\n        # Get the result from the initiated asynchronous inference\n        # request. This call will block till the server responds.\n        result = async_request.get_result()\n        print(\"Response: {}\".format(result.get_response()))\n        print(\"OUT0 = {}\".format(result.as_numpy(\"OUT0\")))\n"
  },
  {
    "path": "examples/clients/recommended_client",
    "content": "#!/usr/bin/env python\n# Copyright 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\nimport argparse\nimport sys\n\nimport numpy as np\nimport tritonclient.http as httpclient\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\n        \"-u\",\n        \"--url\",\n        type=str,\n        required=False,\n        default=\"localhost:8000\",\n        help=\"Inference server URL. 
Default is localhost:8000.\",\n    )\n    FLAGS = parser.parse_args()\n\n    # For the HTTP client, need to specify large enough concurrency to\n    # issue all the inference requests to the server in parallel. For\n    # this example we want to be able to send 2 requests concurrently.\n    try:\n        concurrent_request_count = 2\n        triton_client = httpclient.InferenceServerClient(\n            url=FLAGS.url, concurrency=concurrent_request_count\n        )\n    except Exception as e:\n        print(\"channel creation failed: \" + str(e))\n        sys.exit(1)\n\n    # Send 2 requests to the batching model. Because these are sent\n    # asynchronously and Triton's dynamic batcher is configured to\n    # delay up to 5 seconds when forming a batch for this model, we\n    # expect these 2 requests to be batched within Triton and sent to\n    # the backend as a single batch.\n    #\n    # The recommended backend can handle any model with 1 input and 1\n    # output as long as the input and output datatype and shape are\n    # the same. 
The batching model uses datatype FP32 and shape\n    # [ 4, 4 ].\n    print(\"\\n=========\")\n    async_requests = []\n\n    input0_data = np.array(\n        [\n            [\n                [1.0, 1.1, 1.2, 1.3],\n                [2.0, 2.1, 2.2, 2.3],\n                [3.0, 3.1, 3.2, 3.3],\n                [4.0, 4.1, 4.2, 4.3],\n            ]\n        ],\n        dtype=np.float32,\n    )\n    print(\"Sending request to batching model: input = {}\".format(input0_data))\n    inputs = [httpclient.InferInput(\"INPUT\", [1, 4, 4], \"FP32\")]\n    inputs[0].set_data_from_numpy(input0_data)\n    async_requests.append(triton_client.async_infer(\"batching\", inputs))\n\n    input0_data = np.array(\n        [\n            [\n                [10.0, 10.1, 10.2, 10.3],\n                [20.0, 20.1, 20.2, 20.3],\n                [30.0, 30.1, 30.2, 30.3],\n                [40.0, 40.1, 40.2, 40.3],\n            ]\n        ],\n        dtype=np.float32,\n    )\n    print(\"Sending request to batching model: input = {}\".format(input0_data))\n    inputs = [httpclient.InferInput(\"INPUT\", [1, 4, 4], \"FP32\")]\n    inputs[0].set_data_from_numpy(input0_data)\n    async_requests.append(triton_client.async_infer(\"batching\", inputs))\n\n    for async_request in async_requests:\n        # Get the result from the initiated asynchronous inference\n        # request. This call will block till the server responds.\n        result = async_request.get_result()\n        print(\"Response: {}\".format(result.get_response()))\n        print(\"OUTPUT = {}\".format(result.as_numpy(\"OUTPUT\")))\n"
  },
  {
    "path": "examples/model_repos/bls_models/addsub_onnx/1/model.onnx",
    "content": "\b\n\u0012\u0006triton:\u0002\n\u001b\n\u0006INPUT0\u0012\u0007_INPUT0\"\bIdentity\n\u001b\n\u0006INPUT1\u0012\u0007_INPUT1\"\bIdentity\n\u001e\n\u0007_INPUT0\n\u0007_INPUT1\u0012\u0005CAST0\"\u0003Add\n\u001e\n\u0007_INPUT0\n\u0007_INPUT1\u0012\u0005CAST1\"\u0003Sub\n!\n\u0005CAST0\u0012\u0007OUTPUT0\"\u0004Cast*\t\n\u0002to\u0018\u0001\u0001\u0002\n!\n\u0005CAST1\u0012\u0007OUTPUT1\"\u0004Cast*\t\n\u0002to\u0018\u0001\u0001\u0002\u0012$onnx_nobatch_float32_float32_float32Z\u0014\n\u0006INPUT0\u0012\n\n\b\b\u0001\u0012\u0004\n\u0002\b\u0010Z\u0014\n\u0006INPUT1\u0012\n\n\b\b\u0001\u0012\u0004\n\u0002\b\u0010b\u0015\n\u0007OUTPUT0\u0012\n\n\b\b\u0001\u0012\u0004\n\u0002\b\u0010b\u0015\n\u0007OUTPUT1\u0012\n\n\b\b\u0001\u0012\u0004\n\u0002\b\u0010B\u0002\u0010\u0015"
  },
  {
    "path": "examples/model_repos/bls_models/addsub_onnx/config.pbtxt",
    "content": "name: \"addsub_onnx\"\nplatform: \"onnxruntime_onnx\"\nmax_batch_size: 0\n\ninput [\n  {\n    name: \"INPUT0\"\n    data_type: TYPE_FP32\n    dims: [ 16 ]\n  },\n  {\n    name: \"INPUT1\"\n    data_type: TYPE_FP32\n    dims: [ 16 ]\n  }\n]\noutput [\n  {\n    name: \"OUTPUT0\"\n    data_type: TYPE_FP32\n    dims: [ 16 ]\n  },\n  {\n    name: \"OUTPUT1\"\n    data_type: TYPE_FP32\n    dims: [ 16 ]\n  }\n]\n"
  },
  {
    "path": "examples/model_repos/bls_models/addsub_python/1/model.py",
    "content": "# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\nimport json\n\nimport triton_python_backend_utils as pb_utils\n\n\n# This model calculates the sum and difference of the INPUT0 and INPUT1 and put\n# the results in OUTPUT0 and OUTPUT1 respectively. 
For more information\n# regarding how this model.py was written, please refer to Python Backend.\nclass TritonPythonModel:\n    def initialize(self, args):\n        self.model_config = model_config = json.loads(args[\"model_config\"])\n\n        output0_config = pb_utils.get_output_config_by_name(model_config, \"OUTPUT0\")\n\n        output1_config = pb_utils.get_output_config_by_name(model_config, \"OUTPUT1\")\n\n        self.output0_dtype = pb_utils.triton_string_to_numpy(\n            output0_config[\"data_type\"]\n        )\n        self.output1_dtype = pb_utils.triton_string_to_numpy(\n            output1_config[\"data_type\"]\n        )\n\n    def execute(self, requests):\n        output0_dtype = self.output0_dtype\n        output1_dtype = self.output1_dtype\n\n        responses = []\n\n        for request in requests:\n            in_0 = pb_utils.get_input_tensor_by_name(request, \"INPUT0\")\n            in_1 = pb_utils.get_input_tensor_by_name(request, \"INPUT1\")\n\n            out_0, out_1 = (\n                in_0.as_numpy() + in_1.as_numpy(),\n                in_0.as_numpy() - in_1.as_numpy(),\n            )\n\n            out_tensor_0 = pb_utils.Tensor(\"OUTPUT0\", out_0.astype(output0_dtype))\n            out_tensor_1 = pb_utils.Tensor(\"OUTPUT1\", out_1.astype(output1_dtype))\n\n            inference_response = pb_utils.InferenceResponse(\n                output_tensors=[out_tensor_0, out_tensor_1]\n            )\n            responses.append(inference_response)\n\n        return responses\n\n    def finalize(self):\n        print(\"Cleaning up...\")\n"
  },
  {
    "path": "examples/model_repos/bls_models/addsub_python/config.pbtxt",
    "content": "# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\nname: \"addsub_python\"\nbackend: \"python\"\nmax_batch_size: 0\n\ninput [\n  {\n    name: \"INPUT0\"\n    data_type: TYPE_FP32\n    dims: [ 16 ]\n  }\n]\ninput [\n  {\n    name: \"INPUT1\"\n    data_type: TYPE_FP32\n    dims: [ 16 ]\n  }\n]\noutput [\n  {\n    name: \"OUTPUT0\"\n    data_type: TYPE_FP32\n    dims: [ 16 ]\n  }\n]\noutput [\n  {\n    name: \"OUTPUT1\"\n    data_type: TYPE_FP32\n    dims: [ 16 ]\n  }\n]\n"
  },
  {
    "path": "examples/model_repos/bls_models/bls_fp32/config.pbtxt",
    "content": "# Copyright 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\nname: \"bls_fp32\"\nbackend: \"bls\"\nmax_batch_size: 0\n\ninput [\n  {\n    name: \"INPUT0\"\n    data_type: TYPE_FP32\n    dims: [ 16 ]\n  }\n]\ninput [\n  {\n    name: \"INPUT1\"\n    data_type: TYPE_FP32\n    dims: [ 16 ]\n  }\n]\noutput [\n  {\n    name: \"OUTPUT0\"\n    data_type: TYPE_FP32\n    dims: [ 16 ]\n  }\n]\noutput [\n  {\n    name: \"OUTPUT1\"\n    data_type: TYPE_FP32\n    dims: [ 16 ]\n  }\n]\ninstance_group [\n  {\n    kind: KIND_CPU\n  }\n]\n"
  },
  {
    "path": "examples/model_repos/minimal_models/batching/1/.gitkeep",
    "content": ""
  },
  {
    "path": "examples/model_repos/minimal_models/batching/config.pbtxt",
    "content": "backend: \"minimal\"\nmax_batch_size: 8\ndynamic_batching {\n  max_queue_delay_microseconds: 5000000\n}\ninput [\n  {\n    name: \"IN0\"\n    data_type: TYPE_INT32\n    dims: [ 4 ]\n  }\n]\noutput [\n  {\n    name: \"OUT0\"\n    data_type: TYPE_INT32\n    dims: [ 4 ]\n  }\n]\ninstance_group [\n  {\n    kind: KIND_CPU\n  }\n]\n"
  },
  {
    "path": "examples/model_repos/minimal_models/nonbatching/1/.gitkeep",
    "content": ""
  },
  {
    "path": "examples/model_repos/minimal_models/nonbatching/config.pbtxt",
    "content": "backend: \"minimal\"\nmax_batch_size: 0\ninput [\n  {\n    name: \"IN0\"\n    data_type: TYPE_INT32\n    dims: [ 4 ]\n  }\n]\noutput [\n  {\n    name: \"OUT0\"\n    data_type: TYPE_INT32\n    dims: [ 4 ]\n  }\n]\ninstance_group [\n  {\n    kind: KIND_CPU\n  }\n]\n"
  },
  {
    "path": "examples/model_repos/recommended_models/batching/1/.gitkeep",
    "content": ""
  },
  {
    "path": "examples/model_repos/recommended_models/batching/config.pbtxt",
    "content": "backend: \"recommended\"\nmax_batch_size: 8\ndynamic_batching {\n  max_queue_delay_microseconds: 5000000\n}\ninput [\n  {\n    name: \"INPUT\"\n    data_type: TYPE_FP32\n    dims: [ 4, 4 ]\n  }\n]\noutput [\n  {\n    name: \"OUTPUT\"\n    data_type: TYPE_FP32\n    dims: [ 4, 4 ]\n  }\n]\ninstance_group [\n  {\n    kind: KIND_CPU\n  }\n]\n"
  },
  {
    "path": "include/triton/backend/backend_common.h",
    "content": "// Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n//\n// Redistribution and use in source and binary forms, with or without\n// modification, are permitted provided that the following conditions\n// are met:\n//  * Redistributions of source code must retain the above copyright\n//    notice, this list of conditions and the following disclaimer.\n//  * Redistributions in binary form must reproduce the above copyright\n//    notice, this list of conditions and the following disclaimer in the\n//    documentation and/or other materials provided with the distribution.\n//  * Neither the name of NVIDIA CORPORATION nor the names of its\n//    contributors may be used to endorse or promote products derived\n//    from this software without specific prior written permission.\n//\n// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n// PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n#pragma once\n\n#include <chrono>\n#include <condition_variable>\n#include <deque>\n#include <iostream>\n#include <mutex>\n#include <set>\n#include <string>\n#include <unordered_map>\n#include <vector>\n\n#include \"triton/common/error.h\"\n#include \"triton/core/tritonbackend.h\"\n\n#define TRITONJSON_STATUSTYPE TRITONSERVER_Error*\n#define TRITONJSON_STATUSRETURN(M) \\\n  return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL, (M).c_str())\n#define TRITONJSON_STATUSSUCCESS nullptr\n#include \"triton/common/triton_json.h\"\n\n#ifdef TRITON_ENABLE_GPU\n#include <cuda_runtime_api.h>\n#endif  // TRITON_ENABLE_GPU\n\nnamespace triton { namespace backend {\n\n#define IGNORE_ERROR(X)                   \\\n  do {                                    \\\n    TRITONSERVER_Error* ie_err__ = (X);   \\\n    if (ie_err__ != nullptr) {            \\\n      TRITONSERVER_ErrorDelete(ie_err__); \\\n    }                                     \\\n  } while (false)\n\n#define LOG_IF_ERROR(X, MSG)                                                   \\\n  do {                                                                         \\\n    TRITONSERVER_Error* lie_err__ = (X);                                       \\\n    if (lie_err__ != nullptr) {                                                \\\n      IGNORE_ERROR(TRITONSERVER_LogMessage(                                    \\\n          TRITONSERVER_LOG_ERROR, __FILE__, __LINE__,                          
\\\n          (std::string(MSG) + \": \" + TRITONSERVER_ErrorCodeString(lie_err__) + \\\n           \" - \" + TRITONSERVER_ErrorMessage(lie_err__))                       \\\n              .c_str()));                                                      \\\n      TRITONSERVER_ErrorDelete(lie_err__);                                     \\\n    }                                                                          \\\n  } while (false)\n\n#define LOG_MESSAGE(LEVEL, MSG)                                  \\\n  do {                                                           \\\n    LOG_IF_ERROR(                                                \\\n        TRITONSERVER_LogMessage(LEVEL, __FILE__, __LINE__, MSG), \\\n        (\"failed to log message: \"));                            \\\n  } while (false)\n\n\n#define RETURN_ERROR_IF_FALSE(P, C, MSG)              \\\n  do {                                                \\\n    if (!(P)) {                                       \\\n      return TRITONSERVER_ErrorNew(C, (MSG).c_str()); \\\n    }                                                 \\\n  } while (false)\n\n#define RETURN_ERROR_IF_TRUE(P, C, MSG)               \\\n  do {                                                \\\n    if ((P)) {                                        \\\n      return TRITONSERVER_ErrorNew(C, (MSG).c_str()); \\\n    }                                                 \\\n  } while (false)\n\n#define RETURN_IF_ERROR(X)               \\\n  do {                                   \\\n    TRITONSERVER_Error* rie_err__ = (X); \\\n    if (rie_err__ != nullptr) {          \\\n      return rie_err__;                  \\\n    }                                    \\\n  } while (false)\n\n#ifdef TRITON_ENABLE_GPU\n#define LOG_IF_CUDA_ERROR(X, MSG)                                    \\\n  do {                                                               \\\n    cudaError_t lice_err__ = (X);                                    \\\n    if (lice_err__ != 
cudaSuccess) {                                 \\\n      IGNORE_ERROR(TRITONSERVER_LogMessage(                          \\\n          TRITONSERVER_LOG_INFO, __FILE__, __LINE__,                 \\\n          (std::string(MSG) + \": \" + cudaGetErrorString(lice_err__)) \\\n              .c_str()));                                            \\\n    }                                                                \\\n  } while (false)\n\n#define RETURN_IF_CUDA_ERROR(X, C, MSG)                                \\\n  do {                                                                 \\\n    cudaError_t rice_err__ = (X);                                      \\\n    if (rice_err__ != cudaSuccess) {                                   \\\n      return TRITONSERVER_ErrorNew(                                    \\\n          C, ((MSG) + \": \" + cudaGetErrorString(rice_err__)).c_str()); \\\n    }                                                                  \\\n  } while (false)\n#endif  // TRITON_ENABLE_GPU\n\n#define RESPOND_AND_SET_NULL_IF_ERROR(RESPONSE_PTR, X)               \\\n  do {                                                               \\\n    TRITONSERVER_Error* rarie_err__ = (X);                           \\\n    if (rarie_err__ != nullptr) {                                    \\\n      if (*RESPONSE_PTR != nullptr) {                                \\\n        LOG_IF_ERROR(                                                \\\n            TRITONBACKEND_ResponseSend(                              \\\n                *RESPONSE_PTR, TRITONSERVER_RESPONSE_COMPLETE_FINAL, \\\n                rarie_err__),                                        \\\n            \"failed to send error response\");                        \\\n        *RESPONSE_PTR = nullptr;                                     \\\n      }                                                              \\\n      TRITONSERVER_ErrorDelete(rarie_err__);                         \\\n    }                           
                                     \\\n  } while (false)\n\n#define RESPOND_ALL_AND_SET_NULL_IF_ERROR(RESPONSES, RESPONSES_COUNT, X) \\\n  do {                                                                   \\\n    TRITONSERVER_Error* raasnie_err__ = (X);                             \\\n    if (raasnie_err__ != nullptr) {                                      \\\n      for (size_t ridx = 0; ridx < RESPONSES_COUNT; ++ridx) {            \\\n        if (RESPONSES[ridx] != nullptr) {                                \\\n          LOG_IF_ERROR(                                                  \\\n              TRITONBACKEND_ResponseSend(                                \\\n                  RESPONSES[ridx], TRITONSERVER_RESPONSE_COMPLETE_FINAL, \\\n                  raasnie_err__),                                        \\\n              \"failed to send error response\");                          \\\n          RESPONSES[ridx] = nullptr;                                     \\\n        }                                                                \\\n      }                                                                  \\\n      TRITONSERVER_ErrorDelete(raasnie_err__);                           \\\n    }                                                                    \\\n  } while (false)\n\n#define RESPOND_ALL_AND_SET_TRUE_IF_ERROR(RESPONSES, RESPONSES_COUNT, BOOL, X) \\\n  do {                                                                         \\\n    TRITONSERVER_Error* raasnie_err__ = (X);                                   \\\n    if (raasnie_err__ != nullptr) {                                            \\\n      BOOL = true;                                                             \\\n      for (size_t ridx = 0; ridx < RESPONSES_COUNT; ++ridx) {                  \\\n        if (RESPONSES[ridx] != nullptr) {                                      \\\n          LOG_IF_ERROR(                                                        \\\n              
TRITONBACKEND_ResponseSend(                                      \\\n                  RESPONSES[ridx], TRITONSERVER_RESPONSE_COMPLETE_FINAL,       \\\n                  raasnie_err__),                                              \\\n              \"failed to send error response\");                                \\\n          RESPONSES[ridx] = nullptr;                                           \\\n        }                                                                      \\\n      }                                                                        \\\n      TRITONSERVER_ErrorDelete(raasnie_err__);                                 \\\n    }                                                                          \\\n  } while (false)\n\n#ifdef TRITON_ENABLE_STATS\n#define TIMESPEC_TO_NANOS(TS) ((TS).tv_sec * 1000000000 + (TS).tv_nsec)\n#define SET_TIMESTAMP(TS_NS)                                         \\\n  {                                                                  \\\n    TS_NS = std::chrono::duration_cast<std::chrono::nanoseconds>(    \\\n                std::chrono::steady_clock::now().time_since_epoch()) \\\n                .count();                                            \\\n  }\n#define DECL_TIMESTAMP(TS_NS) \\\n  uint64_t TS_NS;             \\\n  SET_TIMESTAMP(TS_NS);\n#else\n#define DECL_TIMESTAMP(TS_NS)\n#define SET_TIMESTAMP(TS_NS)\n#endif  // TRITON_ENABLE_STATS\n\n#ifndef TRITON_ENABLE_GPU\nusing cudaStream_t = void*;\n#endif  // !TRITON_ENABLE_GPU\n\n/// Convenience deleter for TRITONBACKEND_ResponseFactory.\nstruct ResponseFactoryDeleter {\n  void operator()(TRITONBACKEND_ResponseFactory* f)\n  {\n    LOG_IF_ERROR(\n        TRITONBACKEND_ResponseFactoryDelete(f),\n        \"failed deleting response factory\");\n  }\n};\n\n// A representation of the BatchInput message in model config\nclass BatchInput {\n public:\n  enum class Kind {\n    BATCH_ELEMENT_COUNT,\n    BATCH_ACCUMULATED_ELEMENT_COUNT,\n    
BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO,\n    BATCH_MAX_ELEMENT_COUNT_AS_SHAPE,\n    BATCH_ITEM_SHAPE,\n    BATCH_ITEM_SHAPE_FLATTEN\n  };\n  static TRITONSERVER_Error* ParseFromModelConfig(\n      triton::common::TritonJson::Value& config,\n      std::vector<BatchInput>* batch_inputs);\n  const std::vector<std::string>& TargetNames() const { return target_names_; }\n  TRITONSERVER_DataType DataType() const { return data_type_; }\n  Kind BatchInputKind() const { return kind_; }\n  std::string BatchInputKindString() const { return kind_str_; }\n  const std::vector<std::string>& SourceInputs() const\n  {\n    return source_inputs_;\n  }\n\n private:\n  TRITONSERVER_Error* Init(triton::common::TritonJson::Value& bi_config);\n  Kind kind_;\n  std::string kind_str_;\n  std::vector<std::string> target_names_;\n  TRITONSERVER_DataType data_type_;\n  std::vector<std::string> source_inputs_;\n};\n\n// A representation of the BatchOutput message in model config\nclass BatchOutput {\n public:\n  enum class Kind { BATCH_SCATTER_WITH_INPUT_SHAPE };\n  static TRITONSERVER_Error* ParseFromModelConfig(\n      triton::common::TritonJson::Value& config,\n      std::vector<BatchOutput>* batch_outputs);\n  const std::vector<std::string>& TargetNames() const { return target_names_; }\n  TRITONSERVER_DataType DataType() const { return data_type_; }\n  const std::vector<int64_t>& OutputShape() const { return shape_; }\n  Kind BatchOutputKind() const { return kind_; }\n  const std::vector<std::string>& SourceInputs() const\n  {\n    return source_inputs_;\n  }\n\n private:\n  Kind kind_;\n  std::vector<std::string> target_names_;\n  TRITONSERVER_DataType data_type_;\n  std::vector<int64_t> shape_;\n  std::vector<std::string> source_inputs_;\n};\n\nstruct CopyParams {\n  CopyParams(void* dst, const void* src, const size_t byte_size)\n      : dst_(dst), src_(src), byte_size_(byte_size)\n  {\n  }\n\n  void* dst_;\n  const void* src_;\n  const size_t byte_size_;\n};\n\n/// The value for a 
dimension in a shape that indicates that that\n/// dimension can take on any size.\nconstexpr int WILDCARD_DIM = -1;\n\nconstexpr char kTensorRTExecutionAccelerator[] = \"tensorrt\";\nconstexpr char kOpenVINOExecutionAccelerator[] = \"openvino\";\nconstexpr char kCUDAExecutionAccelerator[] = \"cuda\";\nconstexpr char kGPUIOExecutionAccelerator[] = \"gpu_io\";\nconstexpr char kAutoMixedPrecisionExecutionAccelerator[] =\n    \"auto_mixed_precision\";\n\nTRITONSERVER_MemoryType GetUsePinnedMemoryType(\n    TRITONSERVER_MemoryType ref_buffer_type);\n\nTRITONSERVER_Error* CommonErrorToTritonError(triton::common::Error error);\n\nTRITONSERVER_Error_Code StatusCodeToTritonCode(\n    triton::common::Error::Code error_code);\n\n/// Parse an array in a JSON object into the corresponding shape. The\n/// array must be composed of integers.\n///\n/// \\param io The JSON object containing the member array.\n/// \\param name The name of the array member in the JSON object.\n/// \\param shape Returns the shape.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error* ParseShape(\n    common::TritonJson::Value& io, const std::string& name,\n    std::vector<int64_t>* shape);\n\n/// Return the string representation of a shape.\n///\n/// \\param dims The shape dimensions.\n/// \\param dims_count The number of dimensions.\n/// \\return The string representation.\nstd::string ShapeToString(const int64_t* dims, const size_t dims_count);\n\n/// Return the string representation of a shape.\n///\n/// \\param shape The shape as a vector of dimensions.\n/// \\return The string representation.\nstd::string ShapeToString(const std::vector<int64_t>& shape);\n\n/// Deprecated. 
Use TRITONSERVER_Error* GetElementCount instead.\n/// Return the number of elements of a shape.\n///\n/// \\param dims The shape dimensions.\n/// \\param dims_count The number of dimensions.\n/// \\return The number of elements,\n/// -1 if unable to determine the number,\n/// -2 if the shape contains an invalid dim,\n/// or -3 if the number is too large to represent as an int64_t.\nint64_t GetElementCount(const int64_t* dims, const size_t dims_count);\n\n/// Deprecated. Use TRITONSERVER_Error* GetElementCount instead.\n/// Return the number of elements of a shape.\n///\n/// \\param shape The shape as a vector of dimensions.\n/// \\return The number of elements,\n/// -1 if unable to determine the number,\n/// -2 if the shape contains an invalid dim,\n/// or -3 if the number is too large to represent as an int64_t.\nint64_t GetElementCount(const std::vector<int64_t>& shape);\n\n/// Return the number of elements of a shape with error checking.\n///\n/// \\param dims The shape dimensions.\n/// \\param dims_count The number of dimensions.\n/// \\param cnt Returns the number of elements.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error* GetElementCount(\n    const int64_t* dims, const size_t dims_count, int64_t* cnt);\n\n/// Return the number of elements of a shape with error checking.\n///\n/// \\param shape The shape as a vector of dimensions.\n/// \\param cnt Returns the number of elements.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error* GetElementCount(\n    const std::vector<int64_t>& shape, int64_t* cnt);\n\n/// Deprecated. 
Use TRITONSERVER_Error* GetByteSize instead.\n/// Get the size, in bytes, of a tensor based on datatype and\n/// shape.\n/// \\param dtype The data-type.\n/// \\param dims The shape.\n/// \\return The size, in bytes, of the corresponding tensor,\n/// -1 if unable to determine the size,\n/// -2 if the shape contains an invalid dim,\n/// or -3 if the size is too large to represent as an int64_t.\nint64_t GetByteSize(\n    const TRITONSERVER_DataType& dtype, const std::vector<int64_t>& dims);\n\n/// Get the size, in bytes, of a tensor based on datatype and\n/// shape with error checking.\n/// \\param dtype The data-type.\n/// \\param dims The shape.\n/// \\param size Returns the size, in bytes, of the corresponding tensor.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error* GetByteSize(\n    const TRITONSERVER_DataType& dtype, const std::vector<int64_t>& dims,\n    int64_t* size);\n\n/// Get an input tensor's contents into a buffer. This overload expects\n/// both 'buffer' and buffers of the input to be in CPU.\n///\n/// \\param request The inference request.\n/// \\param input_name The name of the input buffer.\n/// \\param buffer The buffer where the input tensor content is copied into.\n/// \\param buffer_byte_size Acts as both input and output. On input\n/// gives the size of 'buffer', in bytes. The function will fail if\n/// the buffer is not large enough to hold the input tensor\n/// contents. Returns the size of the input tensor data returned in\n/// 'buffer'.\n/// \\param host_policy_name The host policy name to look up the input buffer.\n/// Default input buffer will be used if nullptr is provided.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error* ReadInputTensor(\n    TRITONBACKEND_Request* request, const std::string& input_name, char* buffer,\n    size_t* buffer_byte_size, const char* host_policy_name = nullptr);\n\n/// Get an input tensor's contents into a buffer. 
This overload of\n/// 'ReadInputTensor' supports input buffers that can be in any memory.\n///\n/// \\param request The inference request.\n/// \\param input_name The name of the input buffer.\n/// \\param buffer The buffer where the input tensor content is copied into.\n/// \\param buffer_byte_size Acts as both input and output. On input\n/// gives the size of 'buffer', in bytes. The function will fail if\n/// the buffer is not large enough to hold the input tensor\n/// contents. Returns the size of the input tensor data returned in\n/// 'buffer'.\n/// \\param host_policy_name The host policy name to look up the input buffer.\n/// Default input buffer will be used if nullptr is provided.\n/// \\param memory_type The memory type of the buffer provided.\n/// \\param memory_type_id The memory type id of the buffer provided.\n/// \\param cuda_stream specifies the stream to be associated with, and 0 can be\n/// passed for default stream.\n/// \\param cuda_used returns whether a CUDA memory copy is initiated. If true,\n/// the caller should synchronize on the given 'cuda_stream' to ensure data copy\n/// is completed.\n/// \\param copy_on_stream whether the memory copies should be performed in cuda\n/// host functions on the 'cuda_stream'.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error* ReadInputTensor(\n    TRITONBACKEND_Request* request, const std::string& input_name, char* buffer,\n    size_t* buffer_byte_size, TRITONSERVER_MemoryType memory_type,\n    int64_t memory_type_id, cudaStream_t cuda_stream, bool* cuda_used,\n    const char* host_policy_name = nullptr, const bool copy_on_stream = false);\n\n/// Validate that an input matches one of the allowed input names.\n/// \\param io The model input.\n/// \\param allowed The set of allowed input names.\n/// \\return The error status. 
A non-OK status indicates the input\n/// is not valid.\nTRITONSERVER_Error* CheckAllowedModelInput(\n    common::TritonJson::Value& io, const std::set<std::string>& allowed);\n\n/// Validate that an output matches one of the allowed output names.\n/// \\param io The model output.\n/// \\param allowed The set of allowed output names.\n/// \\return The error status. A non-OK status indicates the output\n/// is not valid.\nTRITONSERVER_Error* CheckAllowedModelOutput(\n    common::TritonJson::Value& io, const std::set<std::string>& allowed);\n\n/// Get the tensor name, false value, and true value for a boolean\n/// sequence batcher control kind. If 'required' is true then must\n/// find a tensor for the control. If 'required' is false, return\n/// 'tensor_name' as empty-string if the control is not mapped to any\n/// tensor.\n///\n/// \\param batcher The JSON object of the sequence batcher.\n/// \\param model_name The name of the model.\n/// \\param control_kind The kind of control tensor to look for.\n/// \\param required Whether the tensor must be specified.\n/// \\param tensor_name Returns the name of the tensor.\n/// \\param tensor_datatype Returns the data type of the tensor.\n/// \\param fp32_false_value Returns the float value for false if\n/// the tensor type is FP32.\n/// \\param fp32_true_value Returns the float value for true if\n/// the tensor type is FP32.\n/// \\param int32_false_value Returns the int value for false if\n/// the tensor type is INT32.\n/// \\param int32_true_value Returns the int value for true if\n/// the tensor type is INT32.\n/// \\param bool_false_value Returns the bool value for false if\n/// the tensor type is BOOL.\n/// \\param bool_true_value Returns the bool value for true if\n/// the tensor type is BOOL.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error* GetBooleanSequenceControlProperties(\n    common::TritonJson::Value& batcher, const std::string& model_name,\n    const std::string& 
control_kind, const bool required,\n    std::string* tensor_name, std::string* tensor_datatype,\n    float* fp32_false_value, float* fp32_true_value, int32_t* int32_false_value,\n    int32_t* int32_true_value, bool* bool_false_value, bool* bool_true_value);\n\n/// Get the tensor name and datatype for a non-boolean sequence\n/// batcher control kind. If 'required' is true then must find a\n/// tensor for the control. If 'required' is false, return\n/// 'tensor_name' as empty-string if the control is not mapped to any\n/// tensor. 'tensor_datatype' returns the required datatype for the\n/// control.\n///\n/// \\param batcher The JSON object of the sequence batcher.\n/// \\param model_name The name of the model.\n/// \\param control_kind The kind of control tensor to look for.\n/// \\param required Whether the tensor must be specified.\n/// \\param tensor_name Returns the name of the tensor.\n/// \\param tensor_datatype Returns the data type of the tensor.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error* GetTypedSequenceControlProperties(\n    common::TritonJson::Value& batcher, const std::string& model_name,\n    const std::string& control_kind, const bool required,\n    std::string* tensor_name, std::string* tensor_datatype);\n\n/// Create and send an error response for a set of requests. 
This\n/// function takes ownership of 'response_err' and so the caller must\n/// not access or delete it after this call returns.\n///\n/// \\param requests The requests.\n/// \\param request_count The number of 'requests'.\n/// \\param response_err The error to send to each request.\n/// \\param release_request If true, the requests will be released after\n/// sending the error responses and the request pointers are set to\n/// nullptr.\nvoid RequestsRespondWithError(\n    TRITONBACKEND_Request** requests, const uint32_t request_count,\n    TRITONSERVER_Error* response_err, const bool release_request = true);\n\n/// Send an error response for a set of responses. This function takes\n/// ownership of 'response_err' and so the caller must not access or\n/// delete it after this call returns.\n///\n/// \\param responses The responses.\n/// \\param response_count The number of 'responses'.\n/// \\param response_err The error to send.\nvoid SendErrorForResponses(\n    std::vector<TRITONBACKEND_Response*>* responses,\n    const uint32_t response_count, TRITONSERVER_Error* response_err);\n\n/// Copy buffer from 'src' to 'dst' for given 'byte_size'. The buffer location\n/// is identified by the memory type and id, and the corresponding copy will be\n/// initiated.\n/// \\param msg The message to be prepended in error message.\n/// \\param src_memory_type The memory type of the source buffer.\n/// \\param src_memory_type_id The memory type id of the source buffer.\n/// \\param dst_memory_type The memory type of the destination buffer.\n/// \\param dst_memory_type_id The memory type id of the destination buffer.\n/// \\param byte_size The byte size of the source buffer.\n/// \\param src The pointer to the source buffer.\n/// \\param dst The pointer to the destination buffer.\n/// \\param cuda_stream specifies the stream to be associated with, and 0 can be\n/// passed for default stream.\n/// \\param cuda_used returns whether a CUDA memory copy is initiated. 
If true,\n/// the caller should synchronize on the given 'cuda_stream' to ensure data copy\n/// is completed.\n/// \\param copy_on_stream whether the memory copies should be performed in cuda\n/// host functions on the 'cuda_stream'.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error* CopyBuffer(\n    const std::string& msg, const TRITONSERVER_MemoryType src_memory_type,\n    const int64_t src_memory_type_id,\n    const TRITONSERVER_MemoryType dst_memory_type,\n    const int64_t dst_memory_type_id, const size_t byte_size, const void* src,\n    void* dst, cudaStream_t cuda_stream, bool* cuda_used,\n    const bool copy_on_stream = false);\n\n/// Does a file or directory exist?\n/// \\param path The path to check for existence.\n/// \\param exists Returns true if file/dir exists\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error* FileExists(const std::string& path, bool* exists);\n\n/// Read a text file into a string.\n/// \\param path The path of the file.\n/// \\param contents Returns the contents of the file.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error* ReadTextFile(\n    const std::string& path, std::string* contents);\n\n/// Is a path a directory?\n/// \\param path The path to check.\n/// \\param is_dir Returns true if path represents a directory\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error* IsDirectory(const std::string& path, bool* is_dir);\n\n/// Join path segments into a longer path\n/// \\param segments The path segments.\n/// \\return the path formed by joining the segments.\nstd::string JoinPath(std::initializer_list<std::string> segments);\n\n/// Returns the content in the model version path and the path to the content as\n/// key-value pair.\n/// \\param model_repository_path The path to the model repository.\n/// \\param version The version of the model.\n/// \\param ignore_directories Whether the 
directories will be ignored.\n/// \\param ignore_files Whether the files will be ignored.\n/// \\param model_paths Returns the content in the model version path and\n/// the path to the content.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error* ModelPaths(\n    const std::string& model_repository_path, uint64_t version,\n    const bool ignore_directories, const bool ignore_files,\n    std::unordered_map<std::string, std::string>* model_paths);\n\n/// Create a CUDA stream appropriate for GPU<->CPU data transfer\n/// operations for a given GPU device. The caller takes ownership of\n/// the stream. 'stream' returns nullptr if GPU support is disabled.\n///\n/// \\param device_id The ID of the GPU.\n/// \\param priority The stream priority. Use 0 for normal priority.\n/// \\param stream Returns the created stream.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error* CreateCudaStream(\n    const int device_id, const int cuda_stream_priority, cudaStream_t* stream);\n\n/// Parse the string as long long integer.\n///\n/// \\param value The string.\n/// \\param parse_value The long long integral value of the string.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error* ParseLongLongValue(\n    const std::string& value, int64_t* parsed_value);\n\n/// Parse the string as unsigned long long integer.\n///\n/// \\param value The string.\n/// \\param parse_value The unsigned long long integral value of the string.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error* ParseUnsignedLongLongValue(\n    const std::string& value, uint64_t* parsed_value);\n\n/// Parse the string as boolean.\n///\n/// \\param value The string.\n/// \\param parse_value The boolean value of the string.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error* ParseBoolValue(\n    const std::string& value, bool* parsed_value);\n\n/// Parse the 
string as integer.\n///\n/// \\param value The string.\n/// \\param parse_value The integral value of the string.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error* ParseIntValue(const std::string& value, int* parsed_value);\n\n/// Parse the string as double.\n///\n/// \\param value The string.\n/// \\param parse_value The double value of the string.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error* ParseDoubleValue(\n    const std::string& value, double* parsed_value);\n\n/// Return the value of the specified key in a JSON object.\n///\n/// \\param params The JSON object containing the key-value mapping.\n/// \\param key The key to look up the value in the JSON object.\n/// \\param value Returns the value.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error* GetParameterValue(\n    triton::common::TritonJson::Value& params, const std::string& key,\n    std::string* value);\n\n/// Return the Triton server data type of the data type string specified\n/// in model config JSON.\n///\n/// \\param data_type_str The string representation of the data type.\n/// \\return the Triton server data type.\nTRITONSERVER_DataType ModelConfigDataTypeToTritonServerDataType(\n    const std::string& data_type_str);\n\n/// Try to parse the requested parameter.\n///\n/// \\param params The param in model config\n/// \\param mkey Key in the model config.\n/// \\param value The parsed string value.\n/// \\param default_value Default value to use when key is not found.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error* TryParseModelStringParameter(\n    triton::common::TritonJson::Value& params, const std::string& mkey,\n    std::string* value, const std::string& default_value);\n\n/// Try to parse the requested parameter.\n///\n/// \\param params The param in model config\n/// \\param mkey Key in the model config.\n/// \\param value The parsed int 
value.\n/// \\param default_value Default value to use when key is not found.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error* TryParseModelStringParameter(\n    triton::common::TritonJson::Value& params, const std::string& mkey,\n    int* value, const int& default_value);\n\n/// Try to parse the requested parameter.\n///\n/// \\param params The param in model config\n/// \\param mkey Key in the model config.\n/// \\param value The parsed bool value.\n/// \\param default_value Default value to use when key is not found.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error* TryParseModelStringParameter(\n    triton::common::TritonJson::Value& params, const std::string& mkey,\n    bool* value, const bool& default_value);\n\n/// Try to parse the requested parameter.\n///\n/// \\param params The param in model config\n/// \\param mkey Key in the model config.\n/// \\param value The parsed uint64 value.\n/// \\param default_value Default value to use when key is not found.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error* TryParseModelStringParameter(\n    triton::common::TritonJson::Value& params, const std::string& mkey,\n    uint64_t* value, const uint64_t& default_value);\n\n/// Get a string representation of a tensor buffer.\n///\n/// \\param str Returns the string.\n/// \\param buffer The base pointer to the tensor buffer.\n/// \\param buffer_byte_size The size of the buffer in bytes.\n/// \\param datatype The type of the tensor\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error* BufferAsTypedString(\n    std::string& str, const char* buffer, size_t buffer_byte_size,\n    TRITONSERVER_DataType datatype);\n\n/// Get the ID of the request as a string formatted for logging.\n///\n/// \\param request Request of which to get the ID.\n/// \\return a formatted string for logging the request ID.\nstd::string 
GetRequestId(TRITONBACKEND_Request* request);\n\n/// Validate the contiguous string buffer with correct format\n/// <int32_len><bytes>...<int32_len><bytes> and parse string\n/// elements into list of pairs of memory address and length.\n/// Note the returned list of pairs points to valid memory as long\n/// as memory pointed by buffer remains allocated.\n///\n/// \\param buffer The pointer to the contiguous string buffer.\n/// \\param buffer_byte_size The size of the buffer in bytes.\n/// \\param expected_element_cnt The number of expected string elements.\n/// \\param input_name The name of the input buffer.\n/// \\param str_list Returns pairs of address and length of parsed strings.\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error* ValidateStringBuffer(\n    const char* buffer, size_t buffer_byte_size,\n    const size_t expected_element_cnt, const char* input_name,\n    std::vector<std::pair<const char*, const uint32_t>>* str_list);\n\n/// Converts incoming utf-8 path to an OS valid path\n///\n/// On Linux there is not much to do.\n/// On Windows we need to take care of the long paths and handle them correctly\n/// to avoid legacy issues with MAX_PATH\n///\n/// More details:\n/// https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation?tabs=registry\n///\n/// \\param path The path to validate\n/// \\param ret_path The updated valid path as per the OS requirements\n/// \\return a TRITONSERVER_Error indicating success or failure.\nTRITONSERVER_Error* GetOSValidPath(\n    const std::string& path, std::string& ret_path);\n\n}}  // namespace triton::backend\n"
  },
  {
    "path": "include/triton/backend/backend_input_collector.h",
    "content": "// Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n//\n// Redistribution and use in source and binary forms, with or without\n// modification, are permitted provided that the following conditions\n// are met:\n//  * Redistributions of source code must retain the above copyright\n//    notice, this list of conditions and the following disclaimer.\n//  * Redistributions in binary form must reproduce the above copyright\n//    notice, this list of conditions and the following disclaimer in the\n//    documentation and/or other materials provided with the distribution.\n//  * Neither the name of NVIDIA CORPORATION nor the names of its\n//    contributors may be used to endorse or promote products derived\n//    from this software without specific prior written permission.\n//\n// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n// PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n#pragma once\n\n#include <list>\n#include <memory>\n#include <string>\n#include <vector>\n\n#include \"triton/backend/backend_common.h\"\n#include \"triton/backend/backend_memory.h\"\n#include \"triton/common/async_work_queue.h\"\n#include \"triton/common/sync_queue.h\"\n#include \"triton/core/tritonbackend.h\"\n\n#ifdef TRITON_ENABLE_GPU\n#include <cuda_runtime_api.h>\n#endif  // TRITON_ENABLE_GPU\n\nnamespace triton { namespace backend {\n\n#ifndef TRITON_ENABLE_GPU\nusing cudaStream_t = void*;\nusing cudaEvent_t = void*;\n#endif  // !TRITON_ENABLE_GPU\n\n//\n// BackendInputCollector\n//\nclass BackendInputCollector {\n public:\n  // The caller can optionally provide 'event' for internal synchronization\n  // instead of using 'stream'. 
If 'host_policy_name' is provided, it must be\n  // valid for the lifetime of the collector\n  explicit BackendInputCollector(\n      TRITONBACKEND_Request** requests, const uint32_t request_count,\n      std::vector<TRITONBACKEND_Response*>* responses,\n      TRITONBACKEND_MemoryManager* memory_manager, const bool pinned_enabled,\n      cudaStream_t stream, cudaEvent_t event = nullptr,\n      cudaEvent_t buffer_ready_event = nullptr,\n      const size_t kernel_buffer_threshold = 0,\n      const char* host_policy_name = nullptr, const bool copy_on_stream = false,\n      const bool coalesce_request_input = false)\n      : need_sync_(false), requests_(requests), request_count_(request_count),\n        responses_(responses), memory_manager_(memory_manager),\n        pinned_enabled_(pinned_enabled),\n        use_async_cpu_copy_(triton::common::AsyncWorkQueue::WorkerCount() > 1),\n        stream_(stream),\n#ifdef TRITON_ENABLE_GPU\n        event_(event), buffer_ready_event_(buffer_ready_event),\n#endif  // TRITON_ENABLE_GPU\n        kernel_buffer_threshold_(kernel_buffer_threshold),\n        pending_pinned_byte_size_(0), pending_pinned_offset_(0),\n        pending_copy_kernel_buffer_byte_size_(0),\n        pending_copy_kernel_buffer_offset_(0),\n        pending_copy_kernel_input_buffer_counts_(0), async_task_count_(0),\n        host_policy_cstr_(host_policy_name), copy_on_stream_(copy_on_stream),\n        coalesce_request_input_(coalesce_request_input)\n  {\n  }\n\n  ~BackendInputCollector() = default;\n\n  // Process all requests for a named input tensor and return the\n  // concatenated values of those requests in a single contiguous\n  // buffer. This overload of the function can avoid data copy if the\n  // tensor values are already contiguous and the caller doesn't\n  // provide a destination 'buffer'.\n  //\n  // 'buffer' is used to determine whether the input should be placed at the\n  //   'buffer' provided by the caller. 
If 'buffer' == nullptr, the returned\n  //   buffer will be managed by the BackendInputCollector object and\n  //   has the same lifecycle as the BackendInputCollector object.\n  // 'buffer_byte_size' is the byte size of 'buffer' if it is not nullptr.\n  // 'allowed_input_types' is the ordered list of the memory type and id pairs\n  //   that the returned buffer can be. It must only contain the memory type\n  //   and id of 'buffer' if 'buffer' is not nullptr.\n  // 'dst_buffer' returns the contiguous buffer of the input tensor.\n  // 'dst_buffer_byte_size' the byte size of 'dst_buffer'.\n  // 'dst_memory_type' returns the memory type of 'dst_buffer'.\n  // 'dst_memory_type_id' returns the memory type id of 'dst_buffer'.\n  TRITONSERVER_Error* ProcessTensor(\n      const char* input_name, char* buffer, const size_t buffer_byte_size,\n      const std::vector<std::pair<TRITONSERVER_MemoryType, int64_t>>&\n          allowed_input_types,\n      const char** dst_buffer, size_t* dst_buffer_byte_size,\n      TRITONSERVER_MemoryType* dst_memory_type, int64_t* dst_memory_type_id);\n\n  // Process all requests for a named input tensor and return the\n  // concatenated values of those requests in a single contiguous\n  // 'buffer'.\n  //\n  // 'buffer' The buffer to hold the concatenates tensor value. Must\n  // be large enough to hold all tensor value.\n  // 'buffer_byte_size' is the byte size of 'buffer'.\n  // 'dst_memory_type' The memory type of 'buffer'.\n  // 'dst_memory_type_id' The memory type id of 'buffer'.\n  void ProcessTensor(\n      const char* input_name, char* buffer, const size_t buffer_byte_size,\n      const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id);\n\n  // Process the batch input and return its shape. 
Returning error indicates\n  // that the batch input can't be formed properly and the caller should abort\n  // the whole batch.\n  TRITONSERVER_Error* BatchInputShape(\n      const BatchInput& batch_input, std::vector<int64_t>* shape);\n\n  // Process the batch input and derive its value into 'buffer'. Returning\n  // error indicates that the batch input can't be formed properly and\n  // the caller should abort the whole batch.\n  // 'buffer' is used to determine whether the input should be placed at the\n  //   'buffer' provided by the caller. If 'buffer' == nullptr, the returned\n  //   buffer will be managed by the BackendInputCollector object and\n  //   has the same lifecycle as the BackendInputCollector object.\n  // 'buffer_byte_size' is the byte size of 'buffer' if it is not nullptr.\n  // 'allowed_input_types' is the ordered list of the memory type and id pairs\n  //   that the returned buffer can be. It must only contain the memory type\n  //   and id of 'buffer' if it is not nullptr.\n  // 'dst_buffer' returns the contiguous buffer of the input tensor.\n  // 'dst_memory_type' returns the memory type of 'dst_buffer'.\n  // 'dst_memory_type_id' returns the memory type id of 'dst_buffer'.\n  TRITONSERVER_Error* ProcessBatchInput(\n      const BatchInput& batch_input, char* buffer,\n      const size_t buffer_byte_size,\n      const std::vector<std::pair<TRITONSERVER_MemoryType, int64_t>>&\n          allowed_input_types,\n      const char** dst_buffer, size_t* dst_buffer_byte_size,\n      TRITONSERVER_MemoryType* dst_memory_type, int64_t* dst_memory_type_id);\n\n  // Finalize processing of all requests for all input tensors. 
Return\n  // true if cudaMemcpyAsync is called, and the caller should call\n  // cudaStreamSynchronize (or cudaEventSynchronize on 'event') before\n  // using the data.\n  bool Finalize();\n\n private:\n  struct ContiguousBuffer {\n    ContiguousBuffer() : start_request_idx_(0), end_request_idx_(0) {}\n    MemoryDesc memory_desc_;\n    size_t start_request_idx_;\n    size_t end_request_idx_;\n  };\n\n  class InputIterator {\n   public:\n    InputIterator(\n        TRITONBACKEND_Request** requests, const uint32_t request_count,\n        std::vector<TRITONBACKEND_Response*>* responses, const char* input_name,\n        const char* host_policy_name, const bool coalesce_request_input);\n\n    // Return false if iterator reaches the end of inputs, 'input' is not set.\n    bool GetNextContiguousInput(ContiguousBuffer* input);\n\n   private:\n    TRITONBACKEND_Request** requests_;\n    const uint32_t request_count_;\n    std::vector<TRITONBACKEND_Response*>* responses_;\n    const char* input_name_;\n    const char* host_policy_;\n    const bool coalesce_request_input_;\n\n    TRITONBACKEND_Input* curr_input_;\n    size_t curr_request_idx_;\n    size_t curr_buffer_idx_;\n    uint32_t curr_buffer_cnt_;\n    bool reach_end_;\n  };\n\n  // Return whether the entire input is in a contiguous buffer. 
If returns true,\n  // the properties of the contiguous input buffer will also be returned.\n  // Otherwise, only 'buffer_byte_size' will be set and return the total byte\n  // size of the input.\n  bool GetInputBufferIfContiguous(\n      const char* input_name, const char** buffer, size_t* buffer_byte_size,\n      TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id);\n  bool FlushPendingPinned(\n      char* tensor_buffer, const size_t tensor_buffer_byte_size,\n      const TRITONSERVER_MemoryType tensor_memory_type,\n      const int64_t tensor_memory_type_id);\n  bool FlushPendingCopyKernel(\n      char* tensor_buffer, const size_t tensor_buffer_byte_size,\n      const TRITONSERVER_MemoryType tensor_memory_type,\n      const int64_t tensor_memory_type_id);\n  TRITONSERVER_Error* LaunchCopyKernel(\n      char* tensor_buffer, const size_t tensor_buffer_byte_size,\n      const TRITONSERVER_MemoryType tensor_memory_type,\n      const int64_t tensor_memory_type_id);\n  bool SetInputTensor(\n      const char* input_name, const ContiguousBuffer& input,\n      char* tensor_buffer, const size_t tensor_buffer_byte_size,\n      const TRITONSERVER_MemoryType tensor_memory_type,\n      const int64_t tensor_memory_type_id, const size_t tensor_buffer_offset,\n      const TRITONSERVER_MemoryType use_pinned_memory_type,\n      const bool use_kernel, const bool wait_buffer);\n  template <typename T>\n  TRITONSERVER_Error* SetElementCount(\n      const std::string& source_input, char* buffer,\n      const size_t buffer_byte_size);\n  template <typename T>\n  TRITONSERVER_Error* SetAccumulatedElementCount(\n      const std::string& source_input, char* buffer,\n      const size_t buffer_byte_size);\n  template <typename T>\n  TRITONSERVER_Error* SetBatchItemShape(\n      const std::string& source_input, char* buffer,\n      const size_t buffer_byte_size);\n\n  bool need_sync_;\n  TRITONBACKEND_Request** requests_;\n  const uint32_t request_count_;\n  
std::vector<TRITONBACKEND_Response*>* responses_;\n  TRITONBACKEND_MemoryManager* memory_manager_;\n  const bool pinned_enabled_;\n  const bool use_async_cpu_copy_;\n  cudaStream_t stream_;\n#ifdef TRITON_ENABLE_GPU\n  cudaEvent_t event_;\n  cudaEvent_t buffer_ready_event_;\n#endif  // TRITON_ENABLE_GPU\n  const size_t kernel_buffer_threshold_;\n\n  size_t pending_pinned_byte_size_;\n  size_t pending_pinned_offset_;\n  std::list<ContiguousBuffer> pending_pinned_input_buffers_;\n\n  // managed memories that need to live over the lifetime of this\n  // BackendInputCollector object.\n  std::list<std::unique_ptr<BackendMemory>> in_use_memories_;\n\n  size_t pending_copy_kernel_buffer_byte_size_;\n  size_t pending_copy_kernel_buffer_offset_;\n  size_t pending_copy_kernel_input_buffer_counts_;\n  std::list<ContiguousBuffer> pending_copy_kernel_input_buffers_;\n  std::vector<std::unique_ptr<std::vector<int8_t*>>> input_ptr_buffer_host_;\n  std::vector<std::unique_ptr<std::vector<size_t>>> byte_size_buffer_host_;\n  std::vector<std::unique_ptr<std::vector<size_t>>>\n      byte_size_offset_buffer_host_;\n\n  // Pinned memory buffers and the corresponding request_inputs where\n  // the final copy to the tensor is deferred until Finalize() after\n  // waiting for all in-flight copies.\n  struct DeferredPinned {\n    DeferredPinned(\n        char* pinned_memory, const size_t pinned_memory_size,\n        char* tensor_buffer, const size_t tensor_buffer_offset,\n        const TRITONSERVER_MemoryType tensor_memory_type,\n        const int64_t tensor_memory_id,\n        std::list<ContiguousBuffer>&& request_buffers,\n        std::vector<TRITONBACKEND_Response*>* responses)\n        : finalized_(false), pinned_memory_(pinned_memory),\n          pinned_memory_size_(pinned_memory_size),\n          tensor_buffer_(tensor_buffer),\n          tensor_buffer_offset_(tensor_buffer_offset),\n          tensor_memory_type_(tensor_memory_type),\n          tensor_memory_id_(tensor_memory_id),\n   
       requests_(std::move(request_buffers)), responses_(responses)\n    {\n    }\n\n    bool Finalize(cudaStream_t stream);\n    bool finalized_;\n    // Holding reference to the pinned memory buffer, which is managed\n    // by BackendInputCollector as 'pinned_memory'\n    char* pinned_memory_;\n    const size_t pinned_memory_size_;\n    char* tensor_buffer_;\n    const size_t tensor_buffer_offset_;\n    const TRITONSERVER_MemoryType tensor_memory_type_;\n    const int64_t tensor_memory_id_;\n    std::list<ContiguousBuffer> requests_;\n    std::vector<TRITONBACKEND_Response*>* responses_;\n  };\n\n  std::list<DeferredPinned> deferred_pinned_;\n  // FIXME use future to maintain an issue-order queue to drop task count\n  triton::common::SyncQueue<bool> completion_queue_;\n  size_t async_task_count_;\n\n  const char* host_policy_cstr_;\n  const bool copy_on_stream_;\n  const bool coalesce_request_input_;\n};\n\n}}  // namespace triton::backend\n"
  },
  {
    "path": "include/triton/backend/backend_memory.h",
    "content": "// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.\n//\n// Redistribution and use in source and binary forms, with or without\n// modification, are permitted provided that the following conditions\n// are met:\n//  * Redistributions of source code must retain the above copyright\n//    notice, this list of conditions and the following disclaimer.\n//  * Redistributions in binary form must reproduce the above copyright\n//    notice, this list of conditions and the following disclaimer in the\n//    documentation and/or other materials provided with the distribution.\n//  * Neither the name of NVIDIA CORPORATION nor the names of its\n//    contributors may be used to endorse or promote products derived\n//    from this software without specific prior written permission.\n//\n// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n// PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n#pragma once\n\n#include <string>\n#include <vector>\n\n#include \"triton/core/tritonbackend.h\"\n#include \"triton/core/tritonserver.h\"\n\nnamespace triton { namespace backend {\n\n// Collection of common properties that describes a buffer in Triton\nstruct MemoryDesc {\n  MemoryDesc()\n      : buffer_(nullptr), byte_size_(0), memory_type_(TRITONSERVER_MEMORY_CPU),\n        memory_type_id_(0)\n  {\n  }\n  MemoryDesc(\n      const char* buffer, size_t byte_size, TRITONSERVER_MemoryType memory_type,\n      int64_t memory_type_id)\n      : buffer_(buffer), byte_size_(byte_size), memory_type_(memory_type),\n        memory_type_id_(memory_type_id)\n  {\n  }\n  const char* buffer_;\n  size_t byte_size_;\n  TRITONSERVER_MemoryType memory_type_;\n  int64_t memory_type_id_;\n};\n\n//\n// BackendMemory\n//\n// Utility class for allocating and deallocating memory using both\n// TRITONBACKEND_MemoryManager and direct GPU and CPU malloc/free.\n//\nclass BackendMemory {\n public:\n  enum class AllocationType { CPU, CPU_PINNED, GPU, CPU_PINNED_POOL, GPU_POOL };\n\n  // Allocate a contiguous block of 'alloc_type' memory.  'mem'\n  // returns the pointer to the allocated memory.\n  //\n  // CPU, CPU_PINNED_POOL and GPU_POOL are allocated using\n  // TRITONBACKEND_MemoryManagerAllocate. 
Note that CPU_PINNED and GPU\n  // allocations can be much slower than the POOL variants.\n  //\n  // Two error codes have specific interpretations for this function:\n  //\n  //   TRITONSERVER_ERROR_UNSUPPORTED: Indicates that function is\n  //     incapable of allocating the requested memory type and memory\n  //     type ID. Requests for the memory type and ID will always fail\n  //     no matter 'byte_size' of the request.\n  //\n  //   TRITONSERVER_ERROR_UNAVAILABLE: Indicates that function can\n  //      allocate the memory type and ID but that currently it cannot\n  //      allocate a contiguous block of memory of the requested\n  //      'byte_size'.\n  static TRITONSERVER_Error* Create(\n      TRITONBACKEND_MemoryManager* manager, const AllocationType alloc_type,\n      const int64_t memory_type_id, const size_t byte_size,\n      BackendMemory** mem);\n\n  // Allocate a contiguous block of memory by attempting the\n  // allocation using 'alloc_types' in order until one is successful.\n  // See BackendMemory::Create() above for details.\n  static TRITONSERVER_Error* Create(\n      TRITONBACKEND_MemoryManager* manager,\n      const std::vector<AllocationType>& alloc_types,\n      const int64_t memory_type_id, const size_t byte_size,\n      BackendMemory** mem);\n\n  // Creates a BackendMemory object from a pre-allocated buffer. The buffer\n  // is not owned by the object created with this function. 
Hence, for\n  // proper operation, the lifetime of the buffer should at least extend till\n  // the corresponding BackendMemory.\n  static TRITONSERVER_Error* Create(\n      TRITONBACKEND_MemoryManager* manager, const AllocationType alloc_type,\n      const int64_t memory_type_id, void* buffer, const size_t byte_size,\n      BackendMemory** mem);\n\n  ~BackendMemory();\n\n  AllocationType AllocType() const { return alloctype_; }\n  int64_t MemoryTypeId() const { return memtype_id_; }\n  char* MemoryPtr() { return buffer_; }\n  size_t ByteSize() const { return byte_size_; }\n  TRITONSERVER_MemoryType MemoryType() const\n  {\n    return AllocTypeToMemoryType(alloctype_);\n  }\n\n  static TRITONSERVER_MemoryType AllocTypeToMemoryType(const AllocationType a);\n  static const char* AllocTypeString(const AllocationType a);\n\n private:\n  BackendMemory(\n      TRITONBACKEND_MemoryManager* manager, const AllocationType alloctype,\n      const int64_t memtype_id, char* buffer, const size_t byte_size,\n      const bool owns_buffer = true)\n      : manager_(manager), alloctype_(alloctype), memtype_id_(memtype_id),\n        buffer_(buffer), byte_size_(byte_size), owns_buffer_(owns_buffer)\n  {\n  }\n\n  TRITONBACKEND_MemoryManager* manager_;\n  AllocationType alloctype_;\n  int64_t memtype_id_;\n  char* buffer_;\n  size_t byte_size_;\n  bool owns_buffer_;\n};\n\n}}  // namespace triton::backend\n"
  },
  {
    "path": "include/triton/backend/backend_model.h",
    "content": "// Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n//\n// Redistribution and use in source and binary forms, with or without\n// modification, are permitted provided that the following conditions\n// are met:\n//  * Redistributions of source code must retain the above copyright\n//    notice, this list of conditions and the following disclaimer.\n//  * Redistributions in binary form must reproduce the above copyright\n//    notice, this list of conditions and the following disclaimer in the\n//    documentation and/or other materials provided with the distribution.\n//  * Neither the name of NVIDIA CORPORATION nor the names of its\n//    contributors may be used to endorse or promote products derived\n//    from this software without specific prior written permission.\n//\n// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n#pragma once\n\n#include <map>\n#include <set>\n#include <string>\n\n#include \"triton/backend/backend_common.h\"\n#include \"triton/core/tritonbackend.h\"\n#include \"triton/core/tritonserver.h\"\n\nnamespace triton { namespace backend {\n\n//\n// BackendModel\n//\n// Common functionality for a backend model. 
This class is provided as\n// a convenience; backends are not required to use this class.\n//\nclass BackendModel {\n public:\n  BackendModel(\n      TRITONBACKEND_Model* triton_model, const bool allow_optional = false);\n  virtual ~BackendModel() = default;\n\n  // Get the handle to the TRITONBACKEND server hosting this model.\n  TRITONSERVER_Server* TritonServer() { return triton_server_; }\n\n  // Get the handle to the memory manager for this model.\n  TRITONBACKEND_MemoryManager* TritonMemoryManager()\n  {\n    return triton_memory_manager_;\n  }\n\n  // Get the handle to the TRITONBACKEND model.\n  TRITONBACKEND_Model* TritonModel() { return triton_model_; }\n\n  // Get the name and version of the model.\n  const std::string& Name() const { return name_; }\n  uint64_t Version() const { return version_; }\n  const std::string& RepositoryPath() const { return repository_path_; }\n\n  // The model configuration.\n  common::TritonJson::Value& ModelConfig() { return model_config_; }\n\n  // Sets the updated model configuration to the core.\n  TRITONSERVER_Error* SetModelConfig();\n\n  // Parses information out of the model configuration.\n  TRITONSERVER_Error* ParseModelConfig();\n\n  // Maximum batch size supported by the model. A value of 0\n  // indicates that the model does not support batching.\n  int MaxBatchSize() const { return max_batch_size_; }\n\n  // Set the max batch size for the model. 
When a backend\n  // auto-completes a configuration it may set or change the maximum\n  // batch size.\n  void SetMaxBatchSize(const int b) { max_batch_size_ = b; }\n\n  // Does this model support batching in the first dimension?\n  TRITONSERVER_Error* SupportsFirstDimBatching(bool* supports);\n\n  // Use indirect pinned memory buffer when copying an input or output\n  // tensor to/from the model.\n  bool EnablePinnedInput() const { return enable_pinned_input_; }\n  bool EnablePinnedOutput() const { return enable_pinned_output_; }\n\n  const std::vector<BatchInput>& BatchInputs() const { return batch_inputs_; }\n  const std::vector<BatchOutput>& BatchOutputs() const\n  {\n    return batch_outputs_;\n  }\n  const BatchOutput* FindBatchOutput(const std::string& output_name) const;\n  bool IsInputRagged(const std::string& input_name) const\n  {\n    return (ragged_inputs_.find(input_name) != ragged_inputs_.end());\n  }\n  bool IsInputOptional(const std::string& input_name) const\n  {\n    return (optional_inputs_.find(input_name) != optional_inputs_.end());\n  }\n\n protected:\n  TRITONSERVER_Server* triton_server_;\n  TRITONBACKEND_MemoryManager* triton_memory_manager_;\n  TRITONBACKEND_Model* triton_model_;\n  std::string name_;\n  uint64_t version_;\n  std::string repository_path_;\n  bool allow_optional_;\n\n  common::TritonJson::Value model_config_;\n  int max_batch_size_;\n  bool enable_pinned_input_;\n  bool enable_pinned_output_;\n  std::vector<BatchInput> batch_inputs_;\n  std::vector<BatchOutput> batch_outputs_;\n  std::map<std::string, const BatchOutput*> batch_output_map_;\n  std::set<std::string> ragged_inputs_;\n  std::set<std::string> optional_inputs_;\n};\n\n//\n// BackendModelException\n//\n// Exception thrown if error occurs while constructing an\n// BackendModel.\n//\nstruct BackendModelException {\n  BackendModelException(TRITONSERVER_Error* err) : err_(err) {}\n  TRITONSERVER_Error* err_;\n};\n\n#define THROW_IF_BACKEND_MODEL_ERROR(X)              
          \\\n  do {                                                         \\\n    TRITONSERVER_Error* tie_err__ = (X);                       \\\n    if (tie_err__ != nullptr) {                                \\\n      throw triton::backend::BackendModelException(tie_err__); \\\n    }                                                          \\\n  } while (false)\n\n}}  // namespace triton::backend\n"
  },
  {
    "path": "include/triton/backend/backend_model_instance.h",
    "content": "// Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.\n//\n// Redistribution and use in source and binary forms, with or without\n// modification, are permitted provided that the following conditions\n// are met:\n//  * Redistributions of source code must retain the above copyright\n//    notice, this list of conditions and the following disclaimer.\n//  * Redistributions in binary form must reproduce the above copyright\n//    notice, this list of conditions and the following disclaimer in the\n//    documentation and/or other materials provided with the distribution.\n//  * Neither the name of NVIDIA CORPORATION nor the names of its\n//    contributors may be used to endorse or promote products derived\n//    from this software without specific prior written permission.\n//\n// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n// PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n#pragma once\n\n#include <string>\n\n#include \"triton/core/tritonbackend.h\"\n\n#ifdef TRITON_ENABLE_GPU\n#include <cuda_runtime_api.h>\n#endif  // TRITON_ENABLE_GPU\n\nnamespace triton { namespace backend {\n\n#ifndef TRITON_ENABLE_GPU\nusing cudaStream_t = void*;\n#endif  // !TRITON_ENABLE_GPU\n\nclass BackendModel;\n\n//\n// BackendModelInstance\n//\n// Common functionality for a backend model instance. This class is\n// provided as a convenience; backends are not required to use this\n// class.\n//\nclass BackendModelInstance {\n public:\n  BackendModelInstance(\n      BackendModel* backend_model,\n      TRITONBACKEND_ModelInstance* triton_model_instance);\n  virtual ~BackendModelInstance();\n\n  // Get the name, kind and device ID of the instance.\n  const std::string& Name() const { return name_; }\n  TRITONSERVER_InstanceGroupKind Kind() const { return kind_; }\n  int32_t DeviceId() const { return device_id_; }\n\n  // Get the handle to the TRITONBACKEND model instance.\n  TRITONBACKEND_ModelInstance* TritonModelInstance()\n  {\n    return triton_model_instance_;\n  }\n\n  // Get the BackendModel representing the model that corresponds to\n  // this instance.\n  BackendModel* Model() const { return backend_model_; }\n\n  // The model configuration 'default_model_filename' value, or the\n  // value in model configuration 'cc_model_filenames' for the GPU\n  // targeted by this instance. 
If neither are specified in the model\n  // configuration, the empty string is returned.\n  const std::string& ArtifactFilename() const { return artifact_filename_; }\n\n  // Returns the stream associated with this instance that can be used\n  // for GPU<->CPU memory transfers. Returns nullptr if GPU support is\n  // disabled or if this instance is not executing on a GPU.\n  cudaStream_t CudaStream() { return stream_; }\n\n  const std::string& HostPolicyName() const { return host_policy_name_; }\n\n protected:\n  BackendModel* backend_model_;\n  TRITONBACKEND_ModelInstance* triton_model_instance_;\n\n  std::string name_;\n  TRITONSERVER_InstanceGroupKind kind_;\n  int32_t device_id_;\n\n  std::string artifact_filename_;\n  cudaStream_t stream_;\n\n  std::string host_policy_name_;\n};\n\n//\n// BackendModelInstanceException\n//\n// Exception thrown if an error occurs while constructing a\n// BackendModelInstance.\n//\nstruct BackendModelInstanceException {\n  BackendModelInstanceException(TRITONSERVER_Error* err) : err_(err) {}\n  TRITONSERVER_Error* err_;\n};\n\n#define THROW_IF_BACKEND_INSTANCE_ERROR(X)                             \\\n  do {                                                                 \\\n    TRITONSERVER_Error* tie_err__ = (X);                               \\\n    if (tie_err__ != nullptr) {                                        \\\n      throw triton::backend::BackendModelInstanceException(tie_err__); \\\n    }                                                                  \\\n  } while (false)\n\n}}  // namespace triton::backend\n"
  },
  {
    "path": "include/triton/backend/backend_output_responder.h",
    "content": "// Copyright 2019-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n//\n// Redistribution and use in source and binary forms, with or without\n// modification, are permitted provided that the following conditions\n// are met:\n//  * Redistributions of source code must retain the above copyright\n//    notice, this list of conditions and the following disclaimer.\n//  * Redistributions in binary form must reproduce the above copyright\n//    notice, this list of conditions and the following disclaimer in the\n//    documentation and/or other materials provided with the distribution.\n//  * Neither the name of NVIDIA CORPORATION nor the names of its\n//    contributors may be used to endorse or promote products derived\n//    from this software without specific prior written permission.\n//\n// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n// PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n#pragma once\n\n#include <list>\n#include <string>\n#include <vector>\n\n#include \"triton/backend/backend_common.h\"\n#include \"triton/common/async_work_queue.h\"\n#include \"triton/core/tritonbackend.h\"\n\n#ifdef TRITON_ENABLE_GPU\n#include <cuda_runtime_api.h>\n#endif  // TRITON_ENABLE_GPU\n\nnamespace triton { namespace backend {\n\n#ifndef TRITON_ENABLE_GPU\nusing cudaStream_t = void*;\nusing cudaEvent_t = void*;\n#endif  // !TRITON_ENABLE_GPU\n\n//\n// BackendOutputResponder\n//\nclass BackendOutputResponder {\n public:\n  // The caller can optionally provide 'event' for internal synchronization\n  // instead of using 'stream'.\n  explicit BackendOutputResponder(\n      TRITONBACKEND_Request** requests, const uint32_t request_count,\n      std::vector<TRITONBACKEND_Response*>* responses,\n      TRITONBACKEND_MemoryManager* memory_manager,\n      const bool first_dim_batching, const bool pinned_enabled,\n      cudaStream_t stream, cudaEvent_t event = nullptr,\n      bool copy_on_stream = false)\n      : need_sync_(false), requests_(requests), request_count_(request_count),\n        responses_(responses), memory_manager_(memory_manager),\n        first_dim_batching_(first_dim_batching),\n        pinned_enabled_(pinned_enabled),\n        use_async_cpu_copy_(triton::common::AsyncWorkQueue::WorkerCount() > 1),\n        stream_(stream), event_(event), pending_pinned_byte_size_(0),\n        copy_on_stream_(copy_on_stream)\n  {\n  
}\n\n  // Legacy constructor for backwards compatibility. The above\n  // constructor should be used for all new cases. The responder needs\n  // to know if the model is batching along the first dimension. With\n  // this constructor we derive that information from the\n  // max_batch_size value instead of having it provided directly as in\n  // the above constructor.\n  explicit BackendOutputResponder(\n      TRITONBACKEND_Request** requests, const uint32_t request_count,\n      std::vector<TRITONBACKEND_Response*>* responses, const int max_batch_size,\n      TRITONBACKEND_MemoryManager* memory_manager, const bool pinned_enabled,\n      cudaStream_t stream, cudaEvent_t event = nullptr,\n      bool copy_on_stream = false)\n      : need_sync_(false), requests_(requests), request_count_(request_count),\n        responses_(responses), memory_manager_(memory_manager),\n        first_dim_batching_(max_batch_size >= 1),\n        pinned_enabled_(pinned_enabled),\n        use_async_cpu_copy_(triton::common::AsyncWorkQueue::WorkerCount() > 1),\n        stream_(stream), event_(event), pending_pinned_byte_size_(0),\n        copy_on_stream_(copy_on_stream)\n  {\n  }\n\n  ~BackendOutputResponder();\n\n  // Process all responses for a named output tensor.\n  // 'batchn_shape' may be modified by the call.\n  void ProcessTensor(\n      const std::string& name, const TRITONSERVER_DataType datatype,\n      std::vector<int64_t>& batchn_shape, const char* buffer,\n      const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id);\n\n  // Process all responses for a named state tensor. 
Returns a vector of\n  // TRITONBACKEND_State objects that the backend can use to update the state.\n  // If TRITONBACKEND_StateUpdate is not called on the vector elements, the\n  // state will not be updated.\n  // 'batchn_shape' may be modified by the call.\n  std::vector<TRITONBACKEND_State*> ProcessStateTensor(\n      const std::string& name, const TRITONSERVER_DataType datatype,\n      std::vector<int64_t>& batchn_shape, const char* buffer,\n      const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id);\n\n  // Process all responses for a batch output and derive its value from\n  // 'buffer'.\n  void ProcessBatchOutput(\n      const std::string& name, const BatchOutput& batch_output,\n      const char* buffer, const TRITONSERVER_MemoryType memory_type,\n      const int64_t memory_type_id);\n\n  // Finalize processing of all responses for all output\n  // tensors. Return true if cudaMemcpyAsync is called, and the caller\n  // should call cudaStreamSynchronize (or cudaEventSynchronize on 'event')\n  // before using the data.\n  bool Finalize();\n\n private:\n  bool FlushPendingPinned(\n      const char* tensor_buffer,\n      const TRITONSERVER_MemoryType tensor_memory_type,\n      const int64_t tensor_memory_type_id);\n  bool SetFixedSizeBuffer(\n      TRITONBACKEND_Response** response, void* response_state_or_output,\n      const std::string& output_name, const size_t tensor_byte_size,\n      const size_t tensor_offset, const char* tensor_buffer,\n      const TRITONSERVER_MemoryType tensor_memory_type,\n      const int64_t tensor_memory_type_id,\n      const TRITONSERVER_MemoryType use_pinned_memory_type, bool state);\n\n  struct OutputData {\n    OutputData(\n        const std::string& name, void* buffer, const size_t buffer_byte_size,\n        const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id)\n        : name_(name), buffer_(buffer), buffer_byte_size_(buffer_byte_size),\n          memory_type_(memory_type), 
memory_type_id_(memory_type_id)\n    {\n    }\n    const std::string name_;\n    void* buffer_;\n    const size_t buffer_byte_size_;\n    const TRITONSERVER_MemoryType memory_type_;\n    const int64_t memory_type_id_;\n  };\n\n  bool need_sync_;\n  TRITONBACKEND_Request** requests_;\n  const uint32_t request_count_;\n  std::vector<TRITONBACKEND_Response*>* responses_;\n  TRITONBACKEND_MemoryManager* memory_manager_;\n  const bool first_dim_batching_;\n  const bool pinned_enabled_;\n  const bool use_async_cpu_copy_;\n  cudaStream_t stream_;\n  cudaEvent_t event_;\n\n  using ResponsesList =\n      std::list<std::pair<TRITONBACKEND_Response**, OutputData>>;\n\n  size_t pending_pinned_byte_size_;\n  size_t pending_pinned_offset_;\n  ResponsesList pending_pinned_outputs_;\n  const bool copy_on_stream_;\n\n  // Pinned memories that need to live over the lifetime of this\n  // BackendOutputResponder object.\n  std::list<char*> pinned_memories_;\n\n  // Pinned memory buffers and the corresponding response outputs\n  // where the final copy to the response is deferred until Finalize()\n  // after waiting for all in-flight copies.\n  struct DeferredPinned {\n    DeferredPinned(\n        char* pinned_memory, const size_t pinned_memory_size,\n        ResponsesList&& responses)\n        : pinned_memory_(pinned_memory),\n          pinned_memory_size_(pinned_memory_size),\n          responses_(std::move(responses))\n    {\n    }\n    char* pinned_memory_;\n    const size_t pinned_memory_size_;\n    ResponsesList responses_;\n  };\n\n  std::list<DeferredPinned> deferred_pinned_;\n};\n\n}}  // namespace triton::backend\n"
  },
  {
    "path": "include/triton/backend/device_memory_tracker.h",
    "content": "// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n//\n// Redistribution and use in source and binary forms, with or without\n// modification, are permitted provided that the following conditions\n// are met:\n//  * Redistributions of source code must retain the above copyright\n//    notice, this list of conditions and the following disclaimer.\n//  * Redistributions in binary form must reproduce the above copyright\n//    notice, this list of conditions and the following disclaimer in the\n//    documentation and/or other materials provided with the distribution.\n//  * Neither the name of NVIDIA CORPORATION nor the names of its\n//    contributors may be used to endorse or promote products derived\n//    from this software without specific prior written permission.\n//\n// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n// PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n#pragma once\n\n#include <algorithm>\n#include <functional>\n#include <map>\n#include <memory>\n#include <mutex>\n#include <unordered_map>\n#include <vector>\n\n#include \"triton/backend/backend_common.h\"\n#include \"triton/common/logging.h\"\n#include \"triton/core/tritonserver.h\"\n\n#if defined(TRITON_ENABLE_GPU) && defined(TRITON_ENABLE_MEMORY_TRACKER)\n#include <cupti.h>\n#endif\n\nstatic_assert(\n    sizeof(uint64_t) >= sizeof(uintptr_t),\n    \"The implementation is storing address pointer as uint64_t, \"\n    \"must ensure the space for pointer is <= sizeof(uint64_t).\");\n\nnamespace triton { namespace backend {\n\n/// DeviceMemoryTracker is a backend utility provided to track the memory\n/// allocated for a particular model and associated model instances.\n/// This utility is often used for backend to report memory usage through\n/// TRITONBACKEND_ModelReportMemoryUsage and\n/// TRITONBACKEND_ModelInstanceReportMemoryUsage, which provides\n/// additional information to Triton for making decision on model scaling and\n/// deployment.\n///\n/// Caveat: The memory tracker is implemented with CUPTI library which currently\n/// only supports single client/subscriber. 
This is a known limitation and as a\n/// result, the memory tracker can cause unexpected application failure if\n/// another component of the Triton process also uses CUPTI with a different\n/// configuration, for example, the framework used by the backend may have\n/// implemented a similar profiler with CUPTI. Therefore, before enabling these\n/// memory tracker utilities, you should make sure that there is no other CUPTI\n/// client in the process. This tracker is implemented with the assumption that\n/// all other CUPTI clients are using the same implementation so that\n/// as long as all backends are compiled with this memory tracker, they may\n/// interact with an externally-initialized CUPTI to the backend without issues.\n///\n/// Typical usage:\n///\n/// On TRITONBACKEND_Initialize\n///  - Call DeviceMemoryTracker::Init\n///\n/// If DeviceMemoryTracker::Init returns true,\n/// DeviceMemoryTracker::TrackThreadMemoryUsage and\n/// DeviceMemoryTracker::UntrackThreadMemoryUsage can be called accordingly to\n/// track memory allocation in the scope between the two calls. The memory usage\n/// will be recorded in a MemoryUsage object and may be reported through\n/// TRITONBACKEND_ModelReportMemoryUsage or\n/// TRITONBACKEND_ModelInstanceReportMemoryUsage based on the entity of the\n/// memory usage.\n///\n/// On reporting memory usage\n///  - Call MemoryUsage::SerializeToBufferAttributes to prepare the usage\n///    in the desired format. 
The BufferAttributes will be owned by MemoryUsage.\n\nextern \"C\" {\n\ntypedef struct TRITONBACKEND_CuptiTracker_t {\n  // C struct require extra implementation for dynamic array, for simplicity,\n  // the following assumptions are made to pre-allocate the array with max\n  // possible length:\n  //  - system / pinned memory allocation should only be on deviceId 0\n  //  - CUDA allocation will only be on visible CUDA devices\n  int64_t* system_memory_usage_byte_;\n  int64_t* pinned_memory_usage_byte_;\n  int64_t* cuda_memory_usage_byte_;\n  uint32_t system_array_len_;\n  uint32_t pinned_array_len_;\n  uint32_t cuda_array_len_;\n\n  // only set to false if somehow the CUPTI activity occurs on index out of\n  // range. In that case, user should invalidate the whole tracker.\n  bool valid_;\n} TRITONBACKEND_CuptiTracker;\n}\n\nclass DeviceMemoryTracker {\n public:\n  struct MemoryUsage {\n    MemoryUsage()\n    {\n      cuda_memory_usage_byte_.resize(CudaDeviceCount(), 0);\n\n      cupti_tracker_.system_memory_usage_byte_ =\n          system_memory_usage_byte_.data();\n      cupti_tracker_.pinned_memory_usage_byte_ =\n          pinned_memory_usage_byte_.data();\n      cupti_tracker_.cuda_memory_usage_byte_ = cuda_memory_usage_byte_.data();\n      cupti_tracker_.system_array_len_ = system_memory_usage_byte_.size();\n      cupti_tracker_.pinned_array_len_ = pinned_memory_usage_byte_.size();\n      cupti_tracker_.cuda_array_len_ = cuda_memory_usage_byte_.size();\n      cupti_tracker_.valid_ = true;\n    }\n\n    ~MemoryUsage()\n    {\n      // Make sure all C struct reference are dropped before clearing.\n      if (tracked_) {\n        UntrackThreadMemoryUsage(this);\n      }\n      for (auto& ba : buffer_attributes_) {\n        if (ba) {\n          LOG_IF_ERROR(\n              TRITONSERVER_BufferAttributesDelete(ba),\n              \"Releasing buffer attributes in MemoryUsage object\");\n        }\n      }\n    }\n\n    // Disable copy and assign to better manage C 
struct lifecycle\n    MemoryUsage(const MemoryUsage&) = delete;\n    void operator=(const MemoryUsage&) = delete;\n\n    // merge record from another MemoryUsage object\n    MemoryUsage& operator+=(const MemoryUsage& rhs)\n    {\n      std::transform(\n          rhs.system_memory_usage_byte_.begin(),\n          rhs.system_memory_usage_byte_.end(),\n          system_memory_usage_byte_.begin(), system_memory_usage_byte_.begin(),\n          std::plus<int64_t>());\n      std::transform(\n          rhs.pinned_memory_usage_byte_.begin(),\n          rhs.pinned_memory_usage_byte_.end(),\n          pinned_memory_usage_byte_.begin(), pinned_memory_usage_byte_.begin(),\n          std::plus<int64_t>());\n      std::transform(\n          rhs.cuda_memory_usage_byte_.begin(),\n          rhs.cuda_memory_usage_byte_.end(), cuda_memory_usage_byte_.begin(),\n          cuda_memory_usage_byte_.begin(), std::plus<int64_t>());\n      return *this;\n    }\n\n    // Serialize the MemoryUsage into an array of TRITONSERVER_BufferAttributes,\n    // the buffer attributes objects are owned by the MemoryUsage object.\n    // Empty usage will be returned if the MemoryUsage object is invalid.\n    TRITONSERVER_Error* SerializeToBufferAttributes(\n        TRITONSERVER_BufferAttributes*** usage, uint32_t* usage_size)\n    {\n      if (!cupti_tracker_.valid_) {\n        return TRITONSERVER_ErrorNew(\n            TRITONSERVER_ERROR_INTERNAL, \"MemoryUsage record is invalid.\");\n      }\n      uint32_t usage_idx = 0;\n\n      // Define lambda to convert a vector of memory usage of the same type of\n      // device into buffer attributes and set in 'usage'\n      auto set_attributes_for_device_fn =\n          [&](const std::vector<int64_t>& devices,\n              const TRITONSERVER_MemoryType mem_type) -> TRITONSERVER_Error* {\n        for (size_t idx = 0; idx < devices.size(); ++idx) {\n          // skip if no allocation\n          if (devices[idx] == 0) {\n            continue;\n          }\n       
   // there is space in usage array\n          if (usage_idx >= buffer_attributes_.size()) {\n            buffer_attributes_.emplace_back(nullptr);\n            RETURN_IF_ERROR(\n                TRITONSERVER_BufferAttributesNew(&buffer_attributes_.back()));\n          }\n          auto entry = buffer_attributes_[usage_idx];\n\n          RETURN_IF_ERROR(\n              TRITONSERVER_BufferAttributesSetMemoryType(entry, mem_type));\n          RETURN_IF_ERROR(\n              TRITONSERVER_BufferAttributesSetMemoryTypeId(entry, idx));\n          RETURN_IF_ERROR(\n              TRITONSERVER_BufferAttributesSetByteSize(entry, devices[idx]));\n\n          ++usage_idx;\n        }\n        return nullptr;  // success\n      };\n\n      RETURN_IF_ERROR(set_attributes_for_device_fn(\n          system_memory_usage_byte_, TRITONSERVER_MEMORY_CPU));\n      RETURN_IF_ERROR(set_attributes_for_device_fn(\n          pinned_memory_usage_byte_, TRITONSERVER_MEMORY_CPU_PINNED));\n      RETURN_IF_ERROR(set_attributes_for_device_fn(\n          cuda_memory_usage_byte_, TRITONSERVER_MEMORY_GPU));\n\n      *usage_size = usage_idx;\n      *usage = buffer_attributes_.data();\n      return nullptr;\n    }\n\n    // Byte size of allocated memory tracked,\n    // 'system_memory_usage_byte_' is likely to be empty as system memory\n    // allocation is not controlled by CUDA driver. 
But keeping it for\n    // completeness.\n    std::vector<int64_t> system_memory_usage_byte_{0};\n    std::vector<int64_t> pinned_memory_usage_byte_{0};\n    std::vector<int64_t> cuda_memory_usage_byte_{0};\n    bool tracked_{false};\n\n    std::vector<TRITONSERVER_BufferAttributes*> buffer_attributes_;\n\n    TRITONBACKEND_CuptiTracker cupti_tracker_;\n  };\n\n  // Simple scope guard to make sure memory usage is untracked without coupling\n  // with MemoryUsage lifecycle\n  struct ScopeGuard {\n    ScopeGuard(MemoryUsage* usage) : usage_(usage) {}\n    ~ScopeGuard()\n    {\n      if (usage_ && usage_->tracked_) {\n        UntrackThreadMemoryUsage(usage_);\n      }\n    }\n    MemoryUsage* usage_{nullptr};\n  };\n\n\n#if defined(TRITON_ENABLE_GPU) && defined(TRITON_ENABLE_MEMORY_TRACKER)\n  static bool Init();\n  static void Fini();\n\n  static int CudaDeviceCount();\n\n  // The memory usage will be tracked and modified until it's untracked, 'usage'\n  // must be valid and not to be modified externally until untrack is called.\n  // Currently can distinguish activity by correlation id which is thread\n  // specific, which implies that there will be missing records if tracking\n  // region switching threads to handle other activities.\n  // This function has no effect if 'usage' is nullptr.\n  static void TrackThreadMemoryUsage(MemoryUsage* usage);\n\n  // Note that CUPTI always pops from the top of the thread-wise stack, must be\n  // careful on the untrack order if there is need to use multiple MemoryUsage\n  // objects.\n  // This function has no effect if 'usage' is nullptr.\n  static void UntrackThreadMemoryUsage(MemoryUsage* usage);\n\n  static bool EnableFromBackendConfig(\n      triton::common::TritonJson::Value& backend_config)\n  {\n    triton::common::TritonJson::Value cmdline;\n    if (backend_config.Find(\"cmdline\", &cmdline)) {\n      triton::common::TritonJson::Value value;\n      std::string value_str;\n      if 
(cmdline.Find(\"triton-backend-memory-tracker\", &value)) {\n        bool lvalue = false;\n        auto err = value.AsString(&value_str);\n        if (err != nullptr) {\n          LOG_IF_ERROR(err, \"Error parsing backend config: \");\n          return false;\n        }\n        err = ParseBoolValue(value_str, &lvalue);\n        if (err != nullptr) {\n          LOG_IF_ERROR(err, \"Error parsing backend config: \");\n          return false;\n        }\n        return lvalue;\n      }\n    }\n    return false;\n  }\n\n  ~DeviceMemoryTracker();\n\n  static void TrackActivity(CUpti_Activity* record)\n  {\n    if (tracker_) {\n      tracker_->TrackActivityInternal(record);\n    }\n  }\n\n private:\n  DeviceMemoryTracker();\n\n  void TrackActivityInternal(CUpti_Activity* record);\n  bool UpdateMemoryTypeUsage(\n      CUpti_ActivityMemory3* memory_record, const bool is_allocation,\n      int64_t* memory_usage, uint32_t usage_len);\n\n  std::mutex mtx_;\n  std::unordered_map<uint32_t, uintptr_t> activity_to_memory_usage_;\n  CUpti_SubscriberHandle subscriber_{nullptr};\n  int device_cnt_{0};\n\n  static std::unique_ptr<DeviceMemoryTracker> tracker_;\n#else   // no-ops\n  static bool Init() { return false; }\n  static void Fini() {}\n  static int CudaDeviceCount() { return 0; }\n  static void TrackThreadMemoryUsage(MemoryUsage* usage) {}\n  static void UntrackThreadMemoryUsage(MemoryUsage* usage) {}\n  static bool EnableFromBackendConfig(\n      const triton::common::TritonJson::Value& backend_config)\n  {\n    return false;\n  }\n#endif  // TRITON_ENABLE_GPU && TRITON_ENABLE_MEMORY_TRACKER\n};\n\n}}  // namespace triton::backend\n"
  },
  {
    "path": "pyproject.toml",
    "content": "# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n#\n# Redistribution and use in source and binary forms, with or without\n# modification, are permitted provided that the following conditions\n# are met:\n#  * Redistributions of source code must retain the above copyright\n#    notice, this list of conditions and the following disclaimer.\n#  * Redistributions in binary form must reproduce the above copyright\n#    notice, this list of conditions and the following disclaimer in the\n#    documentation and/or other materials provided with the distribution.\n#  * Neither the name of NVIDIA CORPORATION nor the names of its\n#    contributors may be used to endorse or promote products derived\n#    from this software without specific prior written permission.\n#\n# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n# PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n[tool.codespell]\n# note: pre-commit passes explicit lists of files here, which this skip file list doesn't override -\n# this is only to allow you to run codespell interactively\nskip = \"./.git,./.github\"\n# ignore short words, and typename parameters like OffsetT\nignore-regex = \"\\\\b(.{1,4}|[A-Z]\\\\w*T)\\\\b\"\n# use the 'clear' dictionary for unambiguous spelling mistakes\nbuiltin = \"clear\"\n# disable warnings about binary files and wrong encoding\nquiet-level = 3\n\n[tool.isort]\nprofile = \"black\"\nuse_parentheses = true\nmulti_line_output = 3\ninclude_trailing_comma = true\nforce_grid_wrap = 0\nensure_newline_before_comments = true\nline_length = 88\nbalanced_wrapping = true\nindent = \"    \"\nskip = [\"build\"]\n"
  },
  {
    "path": "src/backend_common.cc",
    "content": "// Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n//\n// Redistribution and use in source and binary forms, with or without\n// modification, are permitted provided that the following conditions\n// are met:\n//  * Redistributions of source code must retain the above copyright\n//    notice, this list of conditions and the following disclaimer.\n//  * Redistributions in binary form must reproduce the above copyright\n//    notice, this list of conditions and the following disclaimer in the\n//    documentation and/or other materials provided with the distribution.\n//  * Neither the name of NVIDIA CORPORATION nor the names of its\n//    contributors may be used to endorse or promote products derived\n//    from this software without specific prior written permission.\n//\n// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n// PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n#include \"triton/backend/backend_common.h\"\n\n#ifdef _WIN32\n// suppress the min and max definitions in Windef.h.\n#define NOMINMAX\n#include <Windows.h>\n\n// _CRT_INTERNAL_NONSTDC_NAMES 1 before including Microsoft provided C Runtime\n// library to expose declarations without \"_\" prefix to match POSIX style.\n#define _CRT_INTERNAL_NONSTDC_NAMES 1\n#include <direct.h>\n#include <io.h>\n#else\n#include <dirent.h>\n#include <unistd.h>\n#endif\n#include <sys/stat.h>\n\n#include <algorithm>\n#include <cerrno>\n#include <fstream>\n#include <functional>\n#include <memory>\n\n#ifdef _WIN32\n// <sys/stat.h> in Windows doesn't define S_ISDIR macro\n#if !defined(S_ISDIR) && defined(S_IFMT) && defined(S_IFDIR)\n#define S_ISDIR(m) (((m)&S_IFMT) == S_IFDIR)\n#endif\n#define F_OK 0\n#endif\n\nnamespace triton { namespace backend {\n\n#ifdef TRITON_ENABLE_GPU\nvoid CUDART_CB\nMemcpyHost(void* args)\n{\n  auto* copy_params = reinterpret_cast<CopyParams*>(args);\n  memcpy(copy_params->dst_, copy_params->src_, copy_params->byte_size_);\n  delete copy_params;\n}\n#endif  // TRITON_ENABLE_GPU\n\nTRITONSERVER_MemoryType\nGetUsePinnedMemoryType(TRITONSERVER_MemoryType ref_buffer_type)\n{\n  // The following matrix is used for both input and output.\n  // src   \\ dest | non-pinned    | pinned     | device\n  // non-pinned   | memcpy        | memcpy     | buffer needed\n  // pinned       | memcpy        | memcpy     | cudaMemcpy\n  // device       
| buffer needed | cudaMemcpy | cudaMemcpy\n  if (ref_buffer_type == TRITONSERVER_MEMORY_CPU_PINNED) {\n    return TRITONSERVER_MEMORY_CPU_PINNED;\n  }\n\n  return (ref_buffer_type == TRITONSERVER_MEMORY_CPU) ? TRITONSERVER_MEMORY_GPU\n                                                      : TRITONSERVER_MEMORY_CPU;\n}\n\nTRITONSERVER_Error_Code\nStatusCodeToTritonCode(triton::common::Error::Code error_code)\n{\n  switch (error_code) {\n    case triton::common::Error::Code::UNKNOWN:\n      return TRITONSERVER_ERROR_UNKNOWN;\n    case triton::common::Error::Code::INTERNAL:\n      return TRITONSERVER_ERROR_INTERNAL;\n    case triton::common::Error::Code::NOT_FOUND:\n      return TRITONSERVER_ERROR_NOT_FOUND;\n    case triton::common::Error::Code::INVALID_ARG:\n      return TRITONSERVER_ERROR_INVALID_ARG;\n    case triton::common::Error::Code::UNAVAILABLE:\n      return TRITONSERVER_ERROR_UNAVAILABLE;\n    case triton::common::Error::Code::UNSUPPORTED:\n      return TRITONSERVER_ERROR_UNSUPPORTED;\n    case triton::common::Error::Code::ALREADY_EXISTS:\n      return TRITONSERVER_ERROR_ALREADY_EXISTS;\n\n    default:\n      break;\n  }\n\n  return TRITONSERVER_ERROR_UNKNOWN;\n}\n\nTRITONSERVER_Error*\nCommonErrorToTritonError(triton::common::Error error)\n{\n  return TRITONSERVER_ErrorNew(\n      StatusCodeToTritonCode(error.ErrorCode()), error.Message().c_str());\n}\n\nTRITONSERVER_Error*\nParseShape(\n    common::TritonJson::Value& io, const std::string& name,\n    std::vector<int64_t>* shape)\n{\n  common::TritonJson::Value shape_array;\n  RETURN_IF_ERROR(io.MemberAsArray(name.c_str(), &shape_array));\n  for (size_t i = 0; i < shape_array.ArraySize(); ++i) {\n    int64_t d = 0;\n    RETURN_IF_ERROR(shape_array.IndexAsInt(i, &d));\n    shape->push_back(d);\n  }\n\n  return nullptr;  // success\n}\n\nstd::string\nShapeToString(const int64_t* dims, const size_t dims_count)\n{\n  bool first = true;\n\n  std::string str(\"[\");\n  for (size_t i = 0; i < dims_count; ++i) {\n 
   const int64_t dim = dims[i];\n    if (!first) {\n      str += \",\";\n    }\n    str += std::to_string(dim);\n    first = false;\n  }\n\n  str += \"]\";\n  return str;\n}\n\nstd::string\nShapeToString(const std::vector<int64_t>& shape)\n{\n  return ShapeToString(shape.data(), shape.size());\n}\n\nint64_t\nGetElementCount(const int64_t* dims, const size_t dims_count)\n{\n  bool first = true;\n  int64_t cnt = 0;\n  for (size_t i = 0; i < dims_count; i++) {\n    if (dims[i] == WILDCARD_DIM) {\n      return -1;\n    } else if (dims[i] < 0) {  // invalid dim\n      return -2;\n    } else if (dims[i] == 0) {\n      return 0;\n    }\n\n    if (first) {\n      cnt = dims[i];\n      first = false;\n    } else {\n      // Check for overflow before multiplication\n      if (cnt > INT64_MAX / dims[i]) {\n        return -3;\n      }\n      cnt *= dims[i];\n    }\n  }\n\n  return cnt;\n}\n\nint64_t\nGetElementCount(const std::vector<int64_t>& shape)\n{\n  return GetElementCount(shape.data(), shape.size());\n}\n\nTRITONSERVER_Error*\nGetElementCount(const int64_t* dims, const size_t dims_count, int64_t* cnt)\n{\n  *cnt = GetElementCount(dims, dims_count);\n  if (*cnt == -2) {\n    return TRITONSERVER_ErrorNew(\n        TRITONSERVER_ERROR_INVALID_ARG,\n        (std::string(\"shape\") + ShapeToString(dims, dims_count) +\n         \" contains an invalid dim.\")\n            .c_str());\n  } else if (*cnt == -3) {\n    return TRITONSERVER_ErrorNew(\n        TRITONSERVER_ERROR_INVALID_ARG,\n        \"unexpected integer overflow while calculating element count.\");\n  }\n  return nullptr;  // success\n}\n\nTRITONSERVER_Error*\nGetElementCount(const std::vector<int64_t>& shape, int64_t* cnt)\n{\n  *cnt = GetElementCount(shape.data(), shape.size());\n  if (*cnt == -2) {\n    return TRITONSERVER_ErrorNew(\n        TRITONSERVER_ERROR_INVALID_ARG,\n        (std::string(\"shape\") + ShapeToString(shape) +\n         \" contains an invalid dim.\")\n            .c_str());\n  } else if (*cnt 
== -3) {\n    return TRITONSERVER_ErrorNew(\n        TRITONSERVER_ERROR_INVALID_ARG,\n        \"unexpected integer overflow while calculating element count.\");\n  }\n  return nullptr;  // success\n}\n\nint64_t\nGetByteSize(\n    const TRITONSERVER_DataType& dtype, const std::vector<int64_t>& dims)\n{\n  size_t dt_size = TRITONSERVER_DataTypeByteSize(dtype);\n  if (dt_size == 0) {\n    return -1;\n  }\n\n  int64_t cnt = GetElementCount(dims);\n  if (cnt <= 0) {\n    return cnt;\n  }\n\n  if ((cnt > INT64_MAX / dt_size)) {\n    return -3;\n  }\n  return cnt * dt_size;\n}\n\nTRITONSERVER_Error*\nGetByteSize(\n    const TRITONSERVER_DataType& dtype, const std::vector<int64_t>& dims,\n    int64_t* size)\n{\n  *size = GetByteSize(dtype, dims);\n  if (*size == -2) {\n    return TRITONSERVER_ErrorNew(\n        TRITONSERVER_ERROR_INVALID_ARG,\n        (std::string(\"shape\") + ShapeToString(dims) +\n         \" contains an invalid dim.\")\n            .c_str());\n  } else if (*size == -3) {\n    return TRITONSERVER_ErrorNew(\n        TRITONSERVER_ERROR_INVALID_ARG,\n        \"unexpected integer overflow while calculating byte size.\");\n  }\n  return nullptr;  // success\n}\n\nTRITONSERVER_Error*\nReadInputTensor(\n    TRITONBACKEND_Request* request, const std::string& input_name, char* buffer,\n    size_t* buffer_byte_size, TRITONSERVER_MemoryType memory_type,\n    int64_t memory_type_id, cudaStream_t cuda_stream, bool* cuda_used,\n    const char* host_policy_name, const bool copy_on_stream)\n{\n  TRITONBACKEND_Input* input;\n  RETURN_IF_ERROR(\n      TRITONBACKEND_RequestInput(request, input_name.c_str(), &input));\n\n  uint64_t input_byte_size;\n  uint32_t input_buffer_count;\n  RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy(\n      input, host_policy_name, nullptr, nullptr, nullptr, nullptr,\n      &input_byte_size, &input_buffer_count));\n  RETURN_ERROR_IF_FALSE(\n      input_byte_size <= *buffer_byte_size, TRITONSERVER_ERROR_INVALID_ARG,\n      
std::string(\n          GetRequestId(request) + \"buffer too small for input tensor '\" +\n          input_name + \"', \" + std::to_string(*buffer_byte_size) + \" < \" +\n          std::to_string(input_byte_size)));\n\n  size_t output_buffer_offset = 0;\n  for (uint32_t b = 0; b < input_buffer_count; ++b) {\n    const void* input_buffer = nullptr;\n    uint64_t input_buffer_byte_size = 0;\n    TRITONSERVER_MemoryType input_memory_type = TRITONSERVER_MEMORY_CPU;\n    int64_t input_memory_type_id = 0;\n\n    RETURN_IF_ERROR(TRITONBACKEND_InputBufferForHostPolicy(\n        input, host_policy_name, b, &input_buffer, &input_buffer_byte_size,\n        &input_memory_type, &input_memory_type_id));\n\n    RETURN_IF_ERROR(CopyBuffer(\n        \"Failed to copy buffer\", input_memory_type, input_memory_type_id,\n        memory_type, memory_type_id, input_buffer_byte_size, input_buffer,\n        buffer + output_buffer_offset, cuda_stream, cuda_used, copy_on_stream));\n\n    output_buffer_offset += input_buffer_byte_size;\n  }\n\n  *buffer_byte_size = input_byte_size;\n\n  return nullptr;  // success\n}\n\nTRITONSERVER_Error*\nReadInputTensor(\n    TRITONBACKEND_Request* request, const std::string& input_name, char* buffer,\n    size_t* buffer_byte_size, const char* host_policy_name)\n{\n  bool cuda_used;\n  return ReadInputTensor(\n      request, input_name, buffer, buffer_byte_size,\n      TRITONSERVER_MEMORY_CPU /* memory_type */, 0 /* memory_type_id */,\n      0 /* cuda_stream */, &cuda_used);\n}\n\nTRITONSERVER_Error*\nCheckAllowedModelInput(\n    common::TritonJson::Value& io, const std::set<std::string>& allowed)\n{\n  std::string io_name;\n  RETURN_IF_ERROR(io.MemberAsString(\"name\", &io_name));\n  if (allowed.find(io_name) == allowed.end()) {\n    std::string astr;\n    for (const auto& a : allowed) {\n      if (!astr.empty()) {\n        astr.append(\", \");\n      }\n      astr.append(a);\n    }\n\n    return TRITONSERVER_ErrorNew(\n        
TRITONSERVER_ERROR_INVALID_ARG,\n        std::string(\n            \"unexpected inference input '\" + io_name +\n            \"', allowed inputs are: \" + astr)\n            .c_str());\n  }\n  return nullptr;  // success\n}\n\nTRITONSERVER_Error*\nCheckAllowedModelOutput(\n    common::TritonJson::Value& io, const std::set<std::string>& allowed)\n{\n  std::string io_name;\n  RETURN_IF_ERROR(io.MemberAsString(\"name\", &io_name));\n  if (allowed.find(io_name) == allowed.end()) {\n    std::string astr;\n    for (const auto& a : allowed) {\n      if (!astr.empty()) {\n        astr.append(\", \");\n      }\n      astr.append(a);\n    }\n\n    return TRITONSERVER_ErrorNew(\n        TRITONSERVER_ERROR_INVALID_ARG,\n        std::string(\n            \"unexpected inference output '\" + io_name +\n            \"', allowed outputs are: \" + astr)\n            .c_str());\n  }\n\n  return nullptr;  // success\n}\n\nTRITONSERVER_Error*\nGetBooleanSequenceControlProperties(\n    common::TritonJson::Value& batcher, const std::string& model_name,\n    const std::string& control_kind, const bool required,\n    std::string* tensor_name, std::string* tensor_datatype,\n    float* fp32_false_value, float* fp32_true_value, int32_t* int32_false_value,\n    int32_t* int32_true_value, bool* bool_false_value, bool* bool_true_value)\n{\n  // Make sure same tensor is not configured for multiple controls\n  std::set<std::string> seen_tensors;\n\n  // Make sure the control kind is not mentioned multiple times.\n  bool seen_control = false;\n\n  common::TritonJson::Value control_inputs;\n  if (batcher.Find(\"control_input\", &control_inputs)) {\n    for (size_t ci_idx = 0; ci_idx < control_inputs.ArraySize(); ci_idx++) {\n      common::TritonJson::Value control_input;\n      RETURN_IF_ERROR(control_inputs.IndexAsObject(ci_idx, &control_input));\n      std::string input_name;\n      RETURN_IF_ERROR(control_input.MemberAsString(\"name\", &input_name));\n      if (input_name.empty()) {\n        
return TRITONSERVER_ErrorNew(\n            TRITONSERVER_ERROR_INVALID_ARG,\n            (std::string(\n                 \"sequence batching control tensor must have a name for \") +\n             model_name)\n                .c_str());\n      }\n\n      if (seen_tensors.find(input_name) != seen_tensors.end()) {\n        return TRITONSERVER_ErrorNew(\n            TRITONSERVER_ERROR_INVALID_ARG,\n            (std::string(\"sequence batching control tensor '\") + input_name +\n             \"' is specified for multiple control kinds for \" + model_name)\n                .c_str());\n      }\n\n      seen_tensors.insert(input_name);\n      common::TritonJson::Value controls;\n      if (control_input.Find(\"control\", &controls)) {\n        for (size_t c_idx = 0; c_idx < controls.ArraySize(); c_idx++) {\n          common::TritonJson::Value c;\n          RETURN_IF_ERROR(controls.IndexAsObject(c_idx, &c));\n          std::string kind_str;\n          RETURN_IF_ERROR(c.MemberAsString(\"kind\", &kind_str));\n          if (kind_str == control_kind) {\n            if (seen_control) {\n              return TRITONSERVER_ErrorNew(\n                  TRITONSERVER_ERROR_INVALID_ARG,\n                  (std::string(\n                       \"sequence batching specifies multiple \" + control_kind +\n                       \" tensors for \" + model_name)\n                       .c_str()));\n            }\n\n            *tensor_name = input_name;\n            seen_control = true;\n\n            common::TritonJson::Value int32_false_true, fp32_false_true,\n                bool_false_true;\n            bool found_int32 =\n                (c.Find(\"int32_false_true\", &int32_false_true) &&\n                 (int32_false_true.ArraySize() > 0));\n            bool found_fp32 =\n                (c.Find(\"fp32_false_true\", &fp32_false_true) &&\n                 (fp32_false_true.ArraySize() > 0));\n            bool found_bool =\n                (c.Find(\"bool_false_true\", &bool_false_true) 
&&\n                 (bool_false_true.ArraySize() > 0));\n\n            // Make sure only one of int, float, or bool type is specified.\n            if (!(found_int32 || found_fp32 || found_bool)) {\n              return TRITONSERVER_ErrorNew(\n                  TRITONSERVER_ERROR_INVALID_ARG,\n                  (std::string(\n                       \"sequence batching must specify either \"\n                       \"'int32_false_true', 'fp32_false_true' or \"\n                       \"'bool_false_true' for \" +\n                       control_kind + \" for \" + model_name))\n                      .c_str());\n            } else if (\n                (found_fp32 && found_int32) || (found_fp32 && found_bool) ||\n                (found_int32 && found_bool)) {\n              return TRITONSERVER_ErrorNew(\n                  TRITONSERVER_ERROR_INVALID_ARG,\n                  (std::string(\n                       \"sequence batching specifies more than one from \"\n                       \"'int32_false_true', 'fp32_false_true' and \"\n                       \"'bool_false_true' for \" +\n                       control_kind + \" for \" + model_name))\n                      .c_str());\n            }\n\n            if (found_int32) {\n              if (int32_false_true.ArraySize() != 2) {\n                return TRITONSERVER_ErrorNew(\n                    TRITONSERVER_ERROR_INVALID_ARG,\n                    (std::string(\n                         \"sequence batching control 'int32_false_true' must \"\n                         \"have \"\n                         \"exactly 2 entries for \" +\n                         control_kind + \" for \" + model_name))\n                        .c_str());\n              }\n              if (tensor_datatype != nullptr) {\n                *tensor_datatype = \"TYPE_INT32\";\n              }\n              if (int32_false_value != nullptr) {\n                int64_t value;\n                RETURN_IF_ERROR(int32_false_true.IndexAsInt(0, 
&value));\n                *int32_false_value = value;\n              }\n              if (int32_true_value != nullptr) {\n                int64_t value;\n                RETURN_IF_ERROR(int32_false_true.IndexAsInt(1, &value));\n                *int32_true_value = value;\n              }\n            } else if (found_fp32) {\n              if (fp32_false_true.ArraySize() != 2) {\n                return TRITONSERVER_ErrorNew(\n                    TRITONSERVER_ERROR_INVALID_ARG,\n                    (std::string(\n                         \"sequence batching control 'fp32_false_true' must \"\n                         \"have exactly \"\n                         \"2 entries for \" +\n                         control_kind + \" for \" + model_name))\n                        .c_str());\n              }\n              if (tensor_datatype != nullptr) {\n                *tensor_datatype = \"TYPE_FP32\";\n              }\n              if (fp32_false_value != nullptr) {\n                double value = 0.0;\n                RETURN_IF_ERROR(fp32_false_true.IndexAsDouble(0, &value));\n                *fp32_false_value = value;\n              }\n              if (fp32_true_value != nullptr) {\n                double value = 0.0;\n                RETURN_IF_ERROR(fp32_false_true.IndexAsDouble(1, &value));\n                *fp32_true_value = value;\n              }\n            } else {\n              if (bool_false_true.ArraySize() != 2) {\n                return TRITONSERVER_ErrorNew(\n                    TRITONSERVER_ERROR_INVALID_ARG,\n                    (std::string(\n                         \"sequence batching control 'bool_false_true' must \"\n                         \"have exactly \"\n                         \"2 entries for \" +\n                         control_kind + \" for \" + model_name))\n                        .c_str());\n              }\n              if (tensor_datatype != nullptr) {\n                *tensor_datatype = \"TYPE_BOOL\";\n              }\n          
    if (bool_false_value != nullptr) {\n                bool value;\n                RETURN_IF_ERROR(bool_false_true.IndexAsBool(0, &value));\n                *bool_false_value = value;\n              }\n              if (bool_true_value != nullptr) {\n                bool value;\n                RETURN_IF_ERROR(bool_false_true.IndexAsBool(1, &value));\n                *bool_true_value = value;\n              }\n            }\n          }\n        }\n      }\n    }\n  }\n\n  if (!seen_control) {\n    if (required) {\n      return TRITONSERVER_ErrorNew(\n          TRITONSERVER_ERROR_INVALID_ARG,\n          (std::string(\n               \"sequence batching control tensor must specify a \" +\n               control_kind + \" value for \" + model_name))\n              .c_str());\n    }\n\n    tensor_name->clear();\n  }\n\n  return nullptr;  // success\n}\n\nTRITONSERVER_Error*\nGetTypedSequenceControlProperties(\n    common::TritonJson::Value& batcher, const std::string& model_name,\n    const std::string& control_kind, const bool required,\n    std::string* tensor_name, std::string* tensor_datatype)\n{\n  // Make sure same tensor is not configured for multiple controls\n  std::set<std::string> seen_tensors;\n\n  // Make sure the control kind is not mentioned multiple times.\n  bool seen_control = false;\n\n  common::TritonJson::Value control_inputs;\n  if (batcher.Find(\"control_input\", &control_inputs)) {\n    for (size_t ci_idx = 0; ci_idx < control_inputs.ArraySize(); ci_idx++) {\n      common::TritonJson::Value control_input;\n      RETURN_IF_ERROR(control_inputs.IndexAsObject(ci_idx, &control_input));\n      std::string input_name;\n      RETURN_IF_ERROR(control_input.MemberAsString(\"name\", &input_name));\n      if (input_name.empty()) {\n        return TRITONSERVER_ErrorNew(\n            TRITONSERVER_ERROR_INVALID_ARG,\n            (std::string(\n                 \"sequence batching control tensor must have a name for \") +\n             model_name)\n         
       .c_str());\n      }\n      if (seen_tensors.find(input_name) != seen_tensors.end()) {\n        return TRITONSERVER_ErrorNew(\n            TRITONSERVER_ERROR_INVALID_ARG,\n            (std::string(\"sequence batching control tensor '\") + input_name +\n             \"' is specified for multiple control kinds for \" + model_name)\n                .c_str());\n      }\n\n      seen_tensors.insert(input_name);\n      common::TritonJson::Value controls;\n      if (control_input.Find(\"control\", &controls)) {\n        for (size_t c_idx = 0; c_idx < controls.ArraySize(); c_idx++) {\n          common::TritonJson::Value c;\n          RETURN_IF_ERROR(controls.IndexAsObject(c_idx, &c));\n          std::string kind_str;\n          RETURN_IF_ERROR(c.MemberAsString(\"kind\", &kind_str));\n          if (kind_str == control_kind) {\n            if (seen_control) {\n              return TRITONSERVER_ErrorNew(\n                  TRITONSERVER_ERROR_INVALID_ARG,\n                  (std::string(\n                       \"sequence batching specifies multiple \" + control_kind +\n                       \" tensors for \" + model_name)\n                       .c_str()));\n            }\n\n            *tensor_name = input_name;\n            if (tensor_datatype != nullptr) {\n              RETURN_IF_ERROR(c.MemberAsString(\"data_type\", tensor_datatype));\n            }\n\n            seen_control = true;\n\n            common::TritonJson::Value int32_false_true, fp32_false_true,\n                bool_false_true;\n            bool found_int32 =\n                (c.Find(\"int32_false_true\", &int32_false_true) &&\n                 (int32_false_true.ArraySize() > 0));\n            bool found_fp32 =\n                (c.Find(\"fp32_false_true\", &fp32_false_true) &&\n                 (fp32_false_true.ArraySize() > 0));\n            bool found_bool =\n                (c.Find(\"bool_false_true\", &bool_false_true) &&\n                 (bool_false_true.ArraySize() > 0));\n            if 
(found_fp32 || found_int32 || found_bool) {\n              return TRITONSERVER_ErrorNew(\n                  TRITONSERVER_ERROR_INVALID_ARG,\n                  (std::string(\n                       \"sequence batching must not specify either \"\n                       \"'int32_false_true', 'fp32_false_true' or \"\n                       \"'bool_false_true' for \" +\n                       control_kind + \" for \" + model_name))\n                      .c_str());\n            }\n          }\n        }\n      }\n    }\n  }\n\n  if (!seen_control) {\n    if (required) {\n      return TRITONSERVER_ErrorNew(\n          TRITONSERVER_ERROR_INVALID_ARG,\n          (std::string(\n               \"sequence batching control tensor must specify a \" +\n               control_kind + \" value for \" + model_name))\n              .c_str());\n    }\n\n    tensor_name->clear();\n  }\n\n  return nullptr;  // success\n}\n\nvoid\nRequestsRespondWithError(\n    TRITONBACKEND_Request** requests, const uint32_t request_count,\n    TRITONSERVER_Error* response_err, const bool release_request)\n{\n  for (size_t i = 0; i < request_count; i++) {\n    TRITONBACKEND_Response* response;\n    auto err = TRITONBACKEND_ResponseNew(&response, requests[i]);\n    if (err != nullptr) {\n      LOG_MESSAGE(\n          TRITONSERVER_LOG_ERROR,\n          (GetRequestId(requests[i]) + \"fail to create response\").c_str());\n      TRITONSERVER_ErrorDelete(err);\n    } else {\n      LOG_IF_ERROR(\n          TRITONBACKEND_ResponseSend(\n              response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, response_err),\n          (GetRequestId(requests[i]) + \"fail to send error response\").c_str());\n    }\n\n    if (release_request) {\n      LOG_IF_ERROR(\n          TRITONBACKEND_RequestRelease(\n              requests[i], TRITONSERVER_REQUEST_RELEASE_ALL),\n          \"fail to release request\");\n      requests[i] = nullptr;\n    }\n  }\n\n  
TRITONSERVER_ErrorDelete(response_err);\n}\n\nvoid\nSendErrorForResponses(\n    std::vector<TRITONBACKEND_Response*>* responses,\n    const uint32_t response_count, TRITONSERVER_Error* response_err)\n{\n  for (size_t i = 0; i < response_count; i++) {\n    TRITONBACKEND_Response* response = (*responses)[i];\n    if (response != nullptr) {\n      LOG_IF_ERROR(\n          TRITONBACKEND_ResponseSend(\n              response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, response_err),\n          \"fail to send error response\");\n      (*responses)[i] = nullptr;\n    }\n  }\n\n  TRITONSERVER_ErrorDelete(response_err);\n}\n\nTRITONSERVER_Error*\nCopyBuffer(\n    const std::string& msg, const TRITONSERVER_MemoryType src_memory_type,\n    const int64_t src_memory_type_id,\n    const TRITONSERVER_MemoryType dst_memory_type,\n    const int64_t dst_memory_type_id, const size_t byte_size, const void* src,\n    void* dst, cudaStream_t cuda_stream, bool* cuda_used,\n    const bool copy_on_stream)\n{\n  *cuda_used = false;\n\n  if (byte_size > 0) {\n    if (src == nullptr) {\n      return TRITONSERVER_ErrorNew(\n          TRITONSERVER_ERROR_INTERNAL,\n          std::string(\n              msg + \": attempted a copy of \" + std::to_string(byte_size) +\n              \" Bytes from an uninitialized memory\")\n              .c_str());\n    }\n\n    if (dst == nullptr) {\n      return TRITONSERVER_ErrorNew(\n          TRITONSERVER_ERROR_INTERNAL,\n          std::string(\n              msg + \": attempted a copy of \" + std::to_string(byte_size) +\n              \" Bytes to an uninitialized memory\")\n              .c_str());\n    }\n  }\n\n\n  // For CUDA memcpy, if copy_on_stream is false, all host to host copy will be\n  // blocked in respect to the host, so use memcpy() directly. 
In this case,\n  // need to be careful on whether the src buffer is valid.\n  if ((src_memory_type != TRITONSERVER_MEMORY_GPU) &&\n      (dst_memory_type != TRITONSERVER_MEMORY_GPU)) {\n#ifdef TRITON_ENABLE_GPU\n    if (copy_on_stream) {\n      auto params = new CopyParams(dst, src, byte_size);\n      cudaLaunchHostFunc(\n          cuda_stream, MemcpyHost, reinterpret_cast<void*>(params));\n      *cuda_used = true;\n    } else {\n      memcpy(dst, src, byte_size);\n    }\n#else\n    memcpy(dst, src, byte_size);\n#endif  // TRITON_ENABLE_GPU\n  } else {\n#ifdef TRITON_ENABLE_GPU\n    // [TODO] use cudaMemcpyDefault if UVM is supported for the device\n    auto copy_kind = cudaMemcpyDeviceToDevice;\n    if (src_memory_type != TRITONSERVER_MEMORY_GPU) {\n      copy_kind = cudaMemcpyHostToDevice;\n    } else if (dst_memory_type != TRITONSERVER_MEMORY_GPU) {\n      copy_kind = cudaMemcpyDeviceToHost;\n    }\n\n    if ((src_memory_type_id != dst_memory_type_id) &&\n        (copy_kind == cudaMemcpyDeviceToDevice)) {\n      RETURN_IF_CUDA_ERROR(\n          cudaMemcpyPeerAsync(\n              dst, dst_memory_type_id, src, src_memory_type_id, byte_size,\n              cuda_stream),\n          TRITONSERVER_ERROR_INTERNAL, msg + \": failed to perform CUDA copy\");\n    } else {\n      RETURN_IF_CUDA_ERROR(\n          cudaMemcpyAsync(dst, src, byte_size, copy_kind, cuda_stream),\n          TRITONSERVER_ERROR_INTERNAL, msg + \": failed to perform CUDA copy\");\n    }\n\n    *cuda_used = true;\n#else\n    return TRITONSERVER_ErrorNew(\n        TRITONSERVER_ERROR_INTERNAL,\n        std::string(msg + \": try to use CUDA copy while GPU is not supported\")\n            .c_str());\n#endif  // TRITON_ENABLE_GPU\n  }\n\n  return nullptr;  // success\n}\n\nTRITONSERVER_Error*\nGetDirectoryContents(const std::string& path, std::set<std::string>* contents)\n{\n#ifdef _WIN32\n  WIN32_FIND_DATA entry;\n  HANDLE dir = FindFirstFile(path.c_str(), &entry);\n  if (dir == INVALID_HANDLE_VALUE) {\n 
   return TRITONSERVER_ErrorNew(\n        TRITONSERVER_ERROR_INTERNAL,\n        (std::string(\"failed to open directory: \") + path).c_str());\n  }\n  std::string entryname = entry.cFileName;\n  if ((entryname != \".\") && (entryname != \"..\")) {\n    contents->insert(entryname);\n  }\n  while (FindNextFileA(dir, &entry)) {\n    entryname = entry.cFileName;\n    if ((entryname != \".\") && (entryname != \"..\")) {\n      contents->insert(entryname);\n    }\n  }\n\n  FindClose(dir);\n#else\n  DIR* dir = opendir(path.c_str());\n  if (dir == nullptr) {\n    return TRITONSERVER_ErrorNew(\n        TRITONSERVER_ERROR_INTERNAL,\n        (std::string(\"failed to open directory: \") + path).c_str());\n  }\n\n  struct dirent* entry;\n  while ((entry = readdir(dir)) != nullptr) {\n    std::string entryname = entry->d_name;\n    if ((entryname != \".\") && (entryname != \"..\")) {\n      contents->insert(entryname);\n    }\n  }\n\n  closedir(dir);\n#endif\n  return nullptr;  // success\n}\n\nTRITONSERVER_Error*\nFileExists(const std::string& path, bool* exists)\n{\n  std::string valid_path;\n  GetOSValidPath(path, valid_path);\n  *exists = (access(valid_path.c_str(), F_OK) == 0);\n  return nullptr;  // success\n}\n\nTRITONSERVER_Error*\nReadTextFile(const std::string& path, std::string* contents)\n{\n  std::string valid_path;\n  GetOSValidPath(path, valid_path);\n  std::ifstream in(valid_path, std::ios::in | std::ios::binary);\n  if (!in) {\n    return TRITONSERVER_ErrorNew(\n        TRITONSERVER_ERROR_INTERNAL,\n        (\"failed to open/read file '\" + valid_path + \"': \" + strerror(errno))\n            .c_str());\n  }\n\n  in.seekg(0, std::ios::end);\n  contents->resize(in.tellg());\n  in.seekg(0, std::ios::beg);\n  in.read(&(*contents)[0], contents->size());\n  in.close();\n\n  return nullptr;  // success\n}\n\nTRITONSERVER_Error*\nIsDirectory(const std::string& path, bool* is_dir)\n{\n  *is_dir = false;\n\n  std::string valid_path;\n  GetOSValidPath(path, valid_path);\n\n  struct stat st;\n  if 
(stat(valid_path.c_str(), &st) != 0) {\n    return TRITONSERVER_ErrorNew(\n        TRITONSERVER_ERROR_INTERNAL,\n        (std::string(\"failed to stat file \") + valid_path).c_str());\n  }\n\n  *is_dir = S_ISDIR(st.st_mode);\n  return nullptr;  // success\n}\n\nstd::string\nJoinPath(std::initializer_list<std::string> segments)\n{\n  std::string joined;\n\n  for (const auto& seg : segments) {\n    if (joined.empty()) {\n      joined = seg;\n    } else if (!seg.empty() && (seg[0] == '/')) {  // IsAbsolutePath(seg)\n      if (joined[joined.size() - 1] == '/') {\n        joined.append(seg.substr(1));\n      } else {\n        joined.append(seg);\n      }\n    } else {  // !IsAbsolutePath(seg)\n      if (joined[joined.size() - 1] != '/') {\n        joined.append(\"/\");\n      }\n      joined.append(seg);\n    }\n  }\n\n  return joined;\n}\n\nTRITONSERVER_Error*\nModelPaths(\n    const std::string& model_repository_path, uint64_t version,\n    const bool ignore_directories, const bool ignore_files,\n    std::unordered_map<std::string, std::string>* model_paths)\n{\n  std::set<std::string> model_files;\n  // Read all the files in 'path' and filter by type for different requirements\n  auto path = JoinPath({model_repository_path, std::to_string(version)});\n  RETURN_IF_ERROR(GetDirectoryContents(path, &model_files));\n  if (ignore_directories) {\n    // Erase directory entries...\n    for (auto iter = model_files.begin(); iter != model_files.end();) {\n      bool is_dir;\n      RETURN_IF_ERROR(IsDirectory(JoinPath({path, *iter}), &is_dir));\n      if (is_dir) {\n        iter = model_files.erase(iter);\n      } else {\n        ++iter;\n      }\n    }\n  }\n  if (ignore_files) {\n    // Erase non-directory entries...\n    for (auto iter = model_files.begin(); iter != model_files.end();) {\n      bool is_dir;\n      RETURN_IF_ERROR(IsDirectory(JoinPath({path, *iter}), &is_dir));\n      if (!is_dir) {\n        iter = model_files.erase(iter);\n      } else {\n        ++iter;\n  
    }\n    }\n  }\n\n  for (const auto& filename : model_files) {\n    const auto model_path = JoinPath({path, filename});\n    model_paths->emplace(\n        std::piecewise_construct, std::make_tuple(filename),\n        std::make_tuple(model_path));\n  }\n\n  return nullptr;  // success\n}\n\nTRITONSERVER_Error*\nCreateCudaStream(\n    const int device_id, const int cuda_stream_priority, cudaStream_t* stream)\n{\n  *stream = nullptr;\n\n#ifdef TRITON_ENABLE_GPU\n  // Make sure that correct device is set before creating stream and\n  // then restore the device to what was set by the caller.\n  int current_device;\n  auto cuerr = cudaGetDevice(&current_device);\n  bool overridden = false;\n  if (cuerr == cudaSuccess) {\n    overridden = (current_device != device_id);\n    if (overridden) {\n      cuerr = cudaSetDevice(device_id);\n    }\n  }\n\n  if (cuerr == cudaSuccess) {\n    cuerr = cudaStreamCreateWithPriority(\n        stream, cudaStreamDefault, cuda_stream_priority);\n  }\n\n  if (overridden) {\n    cudaSetDevice(current_device);\n  }\n\n  if (cuerr != cudaSuccess) {\n    *stream = nullptr;\n    return TRITONSERVER_ErrorNew(\n        TRITONSERVER_ERROR_INTERNAL,\n        (std::string(\"unable to create stream: \") + cudaGetErrorString(cuerr))\n            .c_str());\n  }\n#endif  // TRITON_ENABLE_GPU\n\n  return nullptr;  // success\n}\n\nTRITONSERVER_Error*\nParseLongLongValue(const std::string& value, int64_t* parsed_value)\n{\n  try {\n    *parsed_value = std::stoll(value);\n  }\n  catch (const std::invalid_argument& ia) {\n    return TRITONSERVER_ErrorNew(\n        TRITONSERVER_ERROR_INVALID_ARG,\n        (std::string(\"failed to convert '\") + value +\n         \"' to long long integral number\")\n            .c_str());\n  }\n\n  return nullptr;  // success\n}\n\nTRITONSERVER_Error*\nParseUnsignedLongLongValue(const std::string& value, uint64_t* parsed_value)\n{\n  try {\n    *parsed_value = std::stoull(value);\n  }\n  catch (const std::invalid_argument& 
ia) {\n    return TRITONSERVER_ErrorNew(\n        TRITONSERVER_ERROR_INVALID_ARG,\n        (std::string(\"failed to convert '\") + value +\n         \"' to unsigned long long integral number\")\n            .c_str());\n  }\n\n  return nullptr;  // success\n}\n\nTRITONSERVER_Error*\nParseBoolValue(const std::string& value, bool* parsed_value)\n{\n  std::string lvalue = value;\n  std::transform(\n      lvalue.begin(), lvalue.end(), lvalue.begin(),\n      [](unsigned char c) { return std::tolower(c); });\n\n  if ((lvalue == \"true\") || (lvalue == \"on\") || (lvalue == \"1\")) {\n    *parsed_value = true;\n    return nullptr;  // success\n  }\n  if ((lvalue == \"false\") || (lvalue == \"off\") || (lvalue == \"0\")) {\n    *parsed_value = false;\n    return nullptr;  // success\n  }\n\n  return TRITONSERVER_ErrorNew(\n      TRITONSERVER_ERROR_INVALID_ARG,\n      (std::string(\"failed to convert '\") + value + \"' to boolean\").c_str());\n}\n\nTRITONSERVER_Error*\nParseIntValue(const std::string& value, int* parsed_value)\n{\n  try {\n    *parsed_value = std::stoi(value);\n  }\n  catch (const std::invalid_argument& ia) {\n    return TRITONSERVER_ErrorNew(\n        TRITONSERVER_ERROR_INVALID_ARG,\n        (std::string(\"failed to convert '\") + value + \"' to integral number\")\n            .c_str());\n  }\n\n  return nullptr;  // success\n}\n\nTRITONSERVER_Error*\nParseDoubleValue(const std::string& value, double* parsed_value)\n{\n  try {\n    *parsed_value = std::stod(value);\n  }\n  catch (const std::invalid_argument& ia) {\n    return TRITONSERVER_ErrorNew(\n        TRITONSERVER_ERROR_INVALID_ARG,\n        (std::string(\"failed to convert '\") + value + \"' to double number\")\n            .c_str());\n  }\n\n  return nullptr;  // success\n}\n\nTRITONSERVER_Error*\nGetParameterValue(\n    triton::common::TritonJson::Value& params, const std::string& key,\n    std::string* value)\n{\n  triton::common::TritonJson::Value json_value;\n  RETURN_ERROR_IF_FALSE(\n      
params.Find(key.c_str(), &json_value), TRITONSERVER_ERROR_NOT_FOUND,\n      std::string(\"model configuration is missing the parameter \") + key);\n  RETURN_IF_ERROR(json_value.MemberAsString(\"string_value\", value));\n  return nullptr;  // success\n}\n\nTRITONSERVER_Error*\nBatchInput::ParseFromModelConfig(\n    triton::common::TritonJson::Value& config,\n    std::vector<BatchInput>* batch_inputs)\n{\n  batch_inputs->clear();\n  triton::common::TritonJson::Value bis;\n  RETURN_IF_ERROR(config.MemberAsArray(\"batch_input\", &bis));\n  for (size_t i = 0; i < bis.ArraySize(); ++i) {\n    triton::common::TritonJson::Value bi;\n    RETURN_IF_ERROR(bis.IndexAsObject(i, &bi));\n    batch_inputs->emplace_back();\n    RETURN_IF_ERROR(batch_inputs->back().Init(bi));\n  }\n\n  return nullptr;  // success\n}\n\nTRITONSERVER_Error*\nBatchInput::Init(triton::common::TritonJson::Value& bi_config)\n{\n  {\n    triton::common::TritonJson::Value bi_target_names;\n    RETURN_IF_ERROR(bi_config.MemberAsArray(\"target_name\", &bi_target_names));\n    for (size_t i = 0; i < bi_target_names.ArraySize(); ++i) {\n      std::string tn;\n      RETURN_IF_ERROR(bi_target_names.IndexAsString(i, &tn));\n      target_names_.emplace_back(std::move(tn));\n    }\n  }\n  {\n    RETURN_IF_ERROR(bi_config.MemberAsString(\"kind\", &kind_str_));\n    if (kind_str_ == \"BATCH_ELEMENT_COUNT\") {\n      kind_ = Kind::BATCH_ELEMENT_COUNT;\n    } else if (kind_str_ == \"BATCH_ACCUMULATED_ELEMENT_COUNT\") {\n      kind_ = Kind::BATCH_ACCUMULATED_ELEMENT_COUNT;\n    } else if (kind_str_ == \"BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO\") {\n      kind_ = Kind::BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO;\n    } else if (kind_str_ == \"BATCH_MAX_ELEMENT_COUNT_AS_SHAPE\") {\n      kind_ = Kind::BATCH_MAX_ELEMENT_COUNT_AS_SHAPE;\n    } else if (kind_str_ == \"BATCH_ITEM_SHAPE\") {\n      kind_ = Kind::BATCH_ITEM_SHAPE;\n    } else if (kind_str_ == \"BATCH_ITEM_SHAPE_FLATTEN\") {\n      kind_ = 
Kind::BATCH_ITEM_SHAPE_FLATTEN;\n    } else {\n      RETURN_ERROR_IF_FALSE(\n          false, TRITONSERVER_ERROR_INVALID_ARG,\n          std::string(\"unexpected batch input kind '\" + kind_str_ + \"'\"));\n    }\n  }\n  {\n    std::string bi_dtype;\n    RETURN_IF_ERROR(bi_config.MemberAsString(\"data_type\", &bi_dtype));\n    data_type_ = ModelConfigDataTypeToTritonServerDataType(bi_dtype);\n    RETURN_ERROR_IF_TRUE(\n        data_type_ == TRITONSERVER_TYPE_INVALID, TRITONSERVER_ERROR_INVALID_ARG,\n        std::string(\"unexpected batch input data type '\" + bi_dtype + \"'\"));\n  }\n  {\n    triton::common::TritonJson::Value bi_source_inputs;\n    RETURN_IF_ERROR(bi_config.MemberAsArray(\"source_input\", &bi_source_inputs));\n    for (size_t i = 0; i < bi_source_inputs.ArraySize(); ++i) {\n      std::string si;\n      RETURN_IF_ERROR(bi_source_inputs.IndexAsString(i, &si));\n      source_inputs_.emplace_back(std::move(si));\n    }\n  }\n  return nullptr;  // success\n}\n\nTRITONSERVER_DataType\nModelConfigDataTypeToTritonServerDataType(const std::string& data_type_str)\n{\n  // Must start with \"TYPE_\".\n  if (data_type_str.rfind(\"TYPE_\", 0) != 0) {\n    return TRITONSERVER_TYPE_INVALID;\n  }\n\n  const std::string dtype = data_type_str.substr(strlen(\"TYPE_\"));\n\n  if (dtype == \"BOOL\") {\n    return TRITONSERVER_TYPE_BOOL;\n  } else if (dtype == \"UINT8\") {\n    return TRITONSERVER_TYPE_UINT8;\n  } else if (dtype == \"UINT16\") {\n    return TRITONSERVER_TYPE_UINT16;\n  } else if (dtype == \"UINT32\") {\n    return TRITONSERVER_TYPE_UINT32;\n  } else if (dtype == \"UINT64\") {\n    return TRITONSERVER_TYPE_UINT64;\n  } else if (dtype == \"INT8\") {\n    return TRITONSERVER_TYPE_INT8;\n  } else if (dtype == \"INT16\") {\n    return TRITONSERVER_TYPE_INT16;\n  } else if (dtype == \"INT32\") {\n    return TRITONSERVER_TYPE_INT32;\n  } else if (dtype == \"INT64\") {\n    return TRITONSERVER_TYPE_INT64;\n  } else if (dtype == \"FP16\") {\n    return 
TRITONSERVER_TYPE_FP16;\n  } else if (dtype == \"FP32\") {\n    return TRITONSERVER_TYPE_FP32;\n  } else if (dtype == \"FP64\") {\n    return TRITONSERVER_TYPE_FP64;\n  } else if (dtype == \"STRING\") {\n    return TRITONSERVER_TYPE_BYTES;\n  } else if (dtype == \"BF16\") {\n    return TRITONSERVER_TYPE_BF16;\n  }\n\n  return TRITONSERVER_TYPE_INVALID;\n}\n\nTRITONSERVER_Error*\nBatchOutput::ParseFromModelConfig(\n    triton::common::TritonJson::Value& config,\n    std::vector<BatchOutput>* batch_outputs)\n{\n  batch_outputs->clear();\n  triton::common::TritonJson::Value bos;\n  RETURN_IF_ERROR(config.MemberAsArray(\"batch_output\", &bos));\n  for (size_t i = 0; i < bos.ArraySize(); ++i) {\n    batch_outputs->emplace_back();\n    auto& batch_output = batch_outputs->back();\n    triton::common::TritonJson::Value bo;\n    RETURN_IF_ERROR(bos.IndexAsObject(i, &bo));\n    {\n      triton::common::TritonJson::Value bo_target_names;\n      RETURN_IF_ERROR(bo.MemberAsArray(\"target_name\", &bo_target_names));\n      for (size_t i = 0; i < bo_target_names.ArraySize(); ++i) {\n        std::string tn;\n        RETURN_IF_ERROR(bo_target_names.IndexAsString(i, &tn));\n        batch_output.target_names_.emplace_back(std::move(tn));\n      }\n    }\n    {\n      std::string bo_kind;\n      RETURN_IF_ERROR(bo.MemberAsString(\"kind\", &bo_kind));\n      if (bo_kind == \"BATCH_SCATTER_WITH_INPUT_SHAPE\") {\n        batch_output.kind_ = Kind::BATCH_SCATTER_WITH_INPUT_SHAPE;\n        // Keep track of the output info for later cross reference with input\n        int64_t mbs = 0;\n        RETURN_IF_ERROR(config.MemberAsInt(\"max_batch_size\", &mbs));\n        if (mbs != 0) {\n          batch_output.shape_.push_back(-1);\n        }\n        triton::common::TritonJson::Value ios;\n        RETURN_IF_ERROR(config.MemberAsArray(\"output\", &ios));\n        for (size_t i = 0; i < ios.ArraySize(); i++) {\n          triton::common::TritonJson::Value io;\n          
RETURN_IF_ERROR(ios.IndexAsObject(i, &io));\n          std::string io_name;\n          RETURN_IF_ERROR(io.MemberAsString(\"name\", &io_name));\n          if (io_name == batch_output.target_names_[0]) {\n            std::string io_dtype;\n            RETURN_IF_ERROR(io.MemberAsString(\"data_type\", &io_dtype));\n            batch_output.data_type_ =\n                ModelConfigDataTypeToTritonServerDataType(io_dtype);\n            // If a reshape is provided for the input then use that when\n            // validating that the model matches what is expected.\n            triton::common::TritonJson::Value reshape;\n            if (io.Find(\"reshape\", &reshape)) {\n              RETURN_IF_ERROR(\n                  ParseShape(reshape, \"shape\", &batch_output.shape_));\n            } else {\n              RETURN_IF_ERROR(ParseShape(io, \"dims\", &batch_output.shape_));\n            }\n            break;\n          }\n        }\n      } else {\n        RETURN_ERROR_IF_FALSE(\n            false, TRITONSERVER_ERROR_INVALID_ARG,\n            std::string(\"unexpected batch output kind '\" + bo_kind + \"'\"));\n      }\n    }\n    {\n      triton::common::TritonJson::Value bo_source_inputs;\n      RETURN_IF_ERROR(bo.MemberAsArray(\"source_input\", &bo_source_inputs));\n      for (size_t i = 0; i < bo_source_inputs.ArraySize(); ++i) {\n        std::string si;\n        RETURN_IF_ERROR(bo_source_inputs.IndexAsString(i, &si));\n        batch_output.source_inputs_.emplace_back(std::move(si));\n      }\n    }\n  }\n\n  return nullptr;  // success\n}\n\nTRITONSERVER_Error*\nTryParseModelStringParameter(\n    triton::common::TritonJson::Value& params, const std::string& mkey,\n    std::string* value, const std::string& default_value)\n{\n  triton::common::TritonJson::Value json_value;\n  if (params.Find(mkey.c_str(), &json_value)) {\n    RETURN_IF_ERROR(json_value.MemberAsString(\"string_value\", value));\n  } else {\n    *value = default_value;\n  }\n\n  return nullptr;  // 
success\n}\n\nTRITONSERVER_Error*\nTryParseModelStringParameter(\n    triton::common::TritonJson::Value& params, const std::string& mkey,\n    int* value, const int& default_value)\n{\n  triton::common::TritonJson::Value json_value;\n  if (params.Find(mkey.c_str(), &json_value)) {\n    std::string string_value;\n    RETURN_IF_ERROR(json_value.MemberAsString(\"string_value\", &string_value));\n    return ParseIntValue(string_value, value);\n  } else {\n    *value = default_value;\n    return nullptr;  // success\n  }\n}\n\nTRITONSERVER_Error*\nTryParseModelStringParameter(\n    triton::common::TritonJson::Value& params, const std::string& mkey,\n    bool* value, const bool& default_value)\n{\n  triton::common::TritonJson::Value json_value;\n  if (params.Find(mkey.c_str(), &json_value)) {\n    std::string string_value;\n    RETURN_IF_ERROR(json_value.MemberAsString(\"string_value\", &string_value));\n    return ParseBoolValue(string_value, value);\n  } else {\n    *value = default_value;\n    return nullptr;  // success\n  }\n}\n\nTRITONSERVER_Error*\nTryParseModelStringParameter(\n    triton::common::TritonJson::Value& params, const std::string& mkey,\n    uint64_t* value, const uint64_t& default_value)\n{\n  triton::common::TritonJson::Value json_value;\n  if (params.Find(mkey.c_str(), &json_value)) {\n    std::string string_value;\n    RETURN_IF_ERROR(json_value.MemberAsString(\"string_value\", &string_value));\n    return ParseUnsignedLongLongValue(string_value, value);\n  } else {\n    *value = default_value;\n    return nullptr;  // success\n  }\n}\n\nnamespace {\n\ntemplate <typename T>\nTRITONSERVER_Error*\nBufferAsTypedString(\n    std::string& str, const char* buffer, const size_t element_cnt)\n{\n  const T* vals = reinterpret_cast<const T*>(buffer);\n\n  str += \"[ \";\n  for (size_t i = 0; i < element_cnt; ++i) {\n    const T& v = vals[i];\n    if (i != 0) {\n      str += \", \";\n    }\n    str += std::to_string(v);\n  }\n\n  str += \" ]\";\n\n  return 
nullptr;  // success\n}\n\n}  // namespace\n\n\nTRITONSERVER_Error*\nBufferAsTypedString(\n    std::string& str, const char* buffer, size_t buffer_byte_size,\n    TRITONSERVER_DataType datatype)\n{\n  const size_t element_cnt =\n      buffer_byte_size / TRITONSERVER_DataTypeByteSize(datatype);\n\n  switch (datatype) {\n    case TRITONSERVER_TYPE_UINT8:\n      return BufferAsTypedString<uint8_t>(str, buffer, element_cnt);\n    case TRITONSERVER_TYPE_UINT16:\n      return BufferAsTypedString<uint16_t>(str, buffer, element_cnt);\n    case TRITONSERVER_TYPE_UINT32:\n      return BufferAsTypedString<uint32_t>(str, buffer, element_cnt);\n    case TRITONSERVER_TYPE_UINT64:\n      return BufferAsTypedString<uint64_t>(str, buffer, element_cnt);\n\n    case TRITONSERVER_TYPE_INT8:\n      return BufferAsTypedString<int8_t>(str, buffer, element_cnt);\n    case TRITONSERVER_TYPE_INT16:\n      return BufferAsTypedString<int16_t>(str, buffer, element_cnt);\n    case TRITONSERVER_TYPE_INT32:\n      return BufferAsTypedString<int32_t>(str, buffer, element_cnt);\n    case TRITONSERVER_TYPE_INT64:\n      return BufferAsTypedString<int64_t>(str, buffer, element_cnt);\n\n    case TRITONSERVER_TYPE_FP32:\n      return BufferAsTypedString<float>(str, buffer, element_cnt);\n    case TRITONSERVER_TYPE_FP64:\n      return BufferAsTypedString<double>(str, buffer, element_cnt);\n\n    default:\n      return TRITONSERVER_ErrorNew(\n          TRITONSERVER_ERROR_INVALID_ARG,\n          std::string(\n              std::string(\"class result not available for output due to \"\n                          \"unsupported type '\") +\n              std::string(TRITONSERVER_DataTypeString(datatype)) + \"'\")\n              .c_str());\n  }\n\n  return nullptr;  // success\n}\n\nstd::string\nGetRequestId(TRITONBACKEND_Request* request)\n{\n  const char* request_id = nullptr;\n  LOG_IF_ERROR(\n      TRITONBACKEND_RequestId(request, &request_id),\n      \"unable to retrieve request ID string\");\n  if 
((request_id == nullptr) || (request_id[0] == '\\0')) {\n    request_id = \"<id_unknown>\";\n  }\n  return std::string(\"[request id: \") + request_id + \"] \";\n}\n\nTRITONSERVER_Error*\nValidateStringBuffer(\n    const char* buffer, size_t buffer_byte_size,\n    const size_t expected_element_cnt, const char* input_name,\n    std::vector<std::pair<const char*, const uint32_t>>* str_list)\n{\n  size_t element_idx = 0;\n  size_t remaining_bytes = buffer_byte_size;\n\n  // Each string in 'buffer' is a 4-byte length followed by the string itself\n  // with no null-terminator.\n  while (remaining_bytes >= sizeof(uint32_t)) {\n    // Do not modify this line. str_list->size() must not exceed\n    // expected_element_cnt.\n    if (element_idx >= expected_element_cnt) {\n      return TRITONSERVER_ErrorNew(\n          TRITONSERVER_ERROR_INVALID_ARG,\n          std::string(\n              \"unexpected number of string elements \" +\n              std::to_string(element_idx + 1) + \" for inference input '\" +\n              input_name + \"', expecting \" +\n              std::to_string(expected_element_cnt))\n              .c_str());\n    }\n\n    const uint32_t len = *(reinterpret_cast<const uint32_t*>(buffer));\n    remaining_bytes -= sizeof(uint32_t);\n    buffer += sizeof(uint32_t);\n\n    if (remaining_bytes < len) {\n      return TRITONSERVER_ErrorNew(\n          TRITONSERVER_ERROR_INVALID_ARG,\n          std::string(\n              \"incomplete string data for inference input '\" +\n              std::string(input_name) + \"', expecting string of length \" +\n              std::to_string(len) + \" but only \" +\n              std::to_string(remaining_bytes) + \" bytes available\")\n              .c_str());\n    }\n\n    if (str_list) {\n      str_list->push_back({buffer, len});\n    }\n    buffer += len;\n    remaining_bytes -= len;\n    element_idx++;\n  }\n\n  if (element_idx != expected_element_cnt) {\n    return TRITONSERVER_ErrorNew(\n        
TRITONSERVER_ERROR_INTERNAL,\n        std::string(\n            \"expected \" + std::to_string(expected_element_cnt) +\n            \" strings for inference input '\" + input_name + \"', got \" +\n            std::to_string(element_idx))\n            .c_str());\n  }\n  return nullptr;\n}\n\nTRITONSERVER_Error*\nGetOSValidPath(const std::string& path, std::string& ret_path)\n{\n  std::string l_path(path);\n#ifdef _WIN32\n  constexpr const char* kWindowsLongPathPrefix = \"\\\\\\\\?\\\\\";\n  // On Windows long paths must be marked correctly otherwise, due to backwards\n  // compatibility, all paths are limited to MAX_PATH length\n  if (l_path.size() >= MAX_PATH) {\n    // Must be prefixed with \"\\\\?\\\" to be considered long path\n    if (l_path.substr(0, 4) != (kWindowsLongPathPrefix)) {\n      // Long path but not \"tagged\" correctly\n      l_path = (kWindowsLongPathPrefix) + l_path;\n    }\n  }\n  std::replace(l_path.begin(), l_path.end(), '/', '\\\\');\n#endif\n  ret_path = l_path;\n  return nullptr;\n}\n\n}}  // namespace triton::backend\n"
  },
  {
    "path": "src/backend_input_collector.cc",
    "content": "// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n//\n// Redistribution and use in source and binary forms, with or without\n// modification, are permitted provided that the following conditions\n// are met:\n//  * Redistributions of source code must retain the above copyright\n//    notice, this list of conditions and the following disclaimer.\n//  * Redistributions in binary form must reproduce the above copyright\n//    notice, this list of conditions and the following disclaimer in the\n//    documentation and/or other materials provided with the distribution.\n//  * Neither the name of NVIDIA CORPORATION nor the names of its\n//    contributors may be used to endorse or promote products derived\n//    from this software without specific prior written permission.\n//\n// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n// PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "triton/backend/backend_input_collector.h"

#include <atomic>

#include "triton/backend/backend_common.h"
#ifdef TRITON_ENABLE_GPU
#include "kernel.h"
#endif  // TRITON_ENABLE_GPU

namespace triton { namespace backend {

//
// BackendInputCollector::InputIterator
//

// Iterates over the buffers of input 'input_name' across all 'requests',
// querying buffer properties under the given host policy. Any error while
// querying a request's input is reported on (and nulls out) that request's
// response via RESPOND_AND_SET_NULL_IF_ERROR.
BackendInputCollector::InputIterator::InputIterator(
    TRITONBACKEND_Request** requests, const uint32_t request_count,
    std::vector<TRITONBACKEND_Response*>* responses, const char* input_name,
    const char* host_policy_name, const bool coalesce_request_input)
    : requests_(requests), request_count_(request_count), responses_(responses),
      input_name_(input_name), host_policy_(host_policy_name),
      coalesce_request_input_(coalesce_request_input), curr_request_idx_(0),
      curr_buffer_idx_(0), reach_end_(false)
{
  // Prime the iterator with the first request's input and its buffer count.
  auto& response = (*responses_)[curr_request_idx_];
  RESPOND_AND_SET_NULL_IF_ERROR(
      &response, TRITONBACKEND_RequestInput(
                     requests_[curr_request_idx_], input_name_, &curr_input_));
  RESPOND_AND_SET_NULL_IF_ERROR(
      &response, TRITONBACKEND_InputPropertiesForHostPolicy(
                     curr_input_, host_policy_, nullptr, nullptr, nullptr,
                     nullptr, nullptr, &curr_buffer_cnt_));
}

// Produces the next contiguous region of input data in 'input'. When
// 'coalesce_request_input_' is set, buffers that are byte-contiguous and
// share the same memory type/id are merged into a single region, possibly
// spanning multiple requests ('start_request_idx_'..'end_request_idx_').
// Returns false once all buffers of all requests have been consumed.
bool
BackendInputCollector::InputIterator::GetNextContiguousInput(
    ContiguousBuffer* input)
{
  if (reach_end_ || (curr_buffer_idx_ >= curr_buffer_cnt_)) {
    return false;
  }

  // Get the first buffer
  TRITONBACKEND_InputBufferForHostPolicy(
      curr_input_, host_policy_, curr_buffer_idx_,
      reinterpret_cast<const void**>(&input->memory_desc_.buffer_),
      reinterpret_cast<uint64_t*>(&input->memory_desc_.byte_size_),
      &input->memory_desc_.memory_type_, &input->memory_desc_.memory_type_id_);
  ++curr_buffer_idx_;
  input->start_request_idx_ = curr_request_idx_;
  input->end_request_idx_ = curr_request_idx_;
  if (!coalesce_request_input_) {
    // Non-coalescing mode: return one buffer at a time; advance to the next
    // request's input once this request's buffers are exhausted.
    if (curr_buffer_idx_ >= curr_buffer_cnt_) {
      ++curr_request_idx_;
      if (curr_request_idx_ < request_count_) {
        auto& response = (*responses_)[curr_request_idx_];
        RESPOND_AND_SET_NULL_IF_ERROR(
            &response,
            TRITONBACKEND_RequestInput(
                requests_[curr_request_idx_], input_name_, &curr_input_));
        RESPOND_AND_SET_NULL_IF_ERROR(
            &response, TRITONBACKEND_InputPropertiesForHostPolicy(
                           curr_input_, host_policy_, nullptr, nullptr, nullptr,
                           nullptr, nullptr, &curr_buffer_cnt_));
        // reset buffer idx
        curr_buffer_idx_ = 0;
      } else {
        reach_end_ = true;
      }
    }
    return true;
  }

  // Coalescing mode: keep extending 'input' while the next buffer starts
  // exactly where the current region ends and lives in the same memory.
  do {
    for (; curr_buffer_idx_ < curr_buffer_cnt_; ++curr_buffer_idx_) {
      const void* next_buffer;
      size_t next_buffer_byte_size;
      TRITONSERVER_MemoryType next_memory_type;
      int64_t next_memory_type_id;
      TRITONBACKEND_InputBufferForHostPolicy(
          curr_input_, host_policy_, curr_buffer_idx_, &next_buffer,
          reinterpret_cast<uint64_t*>(&next_buffer_byte_size),
          &next_memory_type, &next_memory_type_id);
      if (((input->memory_desc_.buffer_ + input->memory_desc_.byte_size_) !=
           next_buffer) ||
          (input->memory_desc_.memory_type_ != next_memory_type) ||
          (input->memory_desc_.memory_type_id_ != next_memory_type_id)) {
        // Not contiguous with the accumulated region; leave this buffer for
        // the next call and return what we have.
        return true;
      }
      input->memory_desc_.byte_size_ += next_buffer_byte_size;
      input->end_request_idx_ = curr_request_idx_;
    }
    // Iterated all buffers for current request, check next
    ++curr_request_idx_;
    if (curr_request_idx_ < request_count_) {
      auto& response = (*responses_)[curr_request_idx_];
      RESPOND_AND_SET_NULL_IF_ERROR(
          &response,
          TRITONBACKEND_RequestInput(
              requests_[curr_request_idx_], input_name_, &curr_input_));
      RESPOND_AND_SET_NULL_IF_ERROR(
          &response, TRITONBACKEND_InputPropertiesForHostPolicy(
                         curr_input_, host_policy_, nullptr, nullptr, nullptr,
                         nullptr, nullptr, &curr_buffer_cnt_));
      // reset buffer idx
      curr_buffer_idx_ = 0;
    }
  } while (curr_request_idx_ < request_count_);
  reach_end_ = true;
  return true;
}

//
// BackendInputCollector
//

// Checks whether the data for 'input_name' is already laid out contiguously
// (same memory type/id, adjacent addresses) across all requests' buffers.
// Returns true if contiguous; '*buffer' points at the start of the region
// and '*buffer_byte_size' is set to the total byte size in either case.
bool
BackendInputCollector::GetInputBufferIfContiguous(
    const char* input_name, const char** buffer, size_t* buffer_byte_size,
    TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id)
{
  *buffer = nullptr;
  *buffer_byte_size = 0;
  const char* expected_next_buffer = nullptr;
  bool contiguous = true;
  for (size_t idx = 0; idx < request_count_; idx++) {
    auto& request = requests_[idx];
    auto& response = (*responses_)[idx];

    TRITONBACKEND_Input* input;
    RESPOND_AND_SET_NULL_IF_ERROR(
        &response, TRITONBACKEND_RequestInput(request, input_name, &input));
    uint64_t byte_size;
    uint32_t buffer_count;
    RESPOND_AND_SET_NULL_IF_ERROR(
        &response, TRITONBACKEND_InputPropertiesForHostPolicy(
                       input, host_policy_cstr_, nullptr, nullptr, nullptr,
                       nullptr, &byte_size, &buffer_count));
    // NOTE(review): this inner 'idx' shadows the request-loop 'idx' above;
    // behavior is correct but the shadowing is easy to misread.
    for (size_t idx = 0; idx < buffer_count; ++idx) {
      const void* src_buffer;
      size_t src_byte_size;
      TRITONSERVER_MemoryType src_memory_type;
      int64_t src_memory_type_id;

      RESPOND_AND_SET_NULL_IF_ERROR(
          &response, TRITONBACKEND_InputBufferForHostPolicy(
                         input, host_policy_cstr_, idx, &src_buffer,
                         reinterpret_cast<uint64_t*>(&src_byte_size),
                         &src_memory_type, &src_memory_type_id));
      if (*buffer != nullptr) {
        // If have seen the second buffer while coalescing input is not
        // requested, treat the inputs are not contiguous
        if (coalesce_request_input_ && (expected_next_buffer == src_buffer) &&
            (*memory_type == src_memory_type) &&
            (*memory_type_id == src_memory_type_id)) {
          expected_next_buffer += src_byte_size;
        } else {
          contiguous = false;
        }
        // Want to know total buffer byte size even if it is not contiguous
        *buffer_byte_size += src_byte_size;
      } else {
        // First buffer seen: record its location/type and where the next
        // buffer must start for the whole input to be contiguous.
        *buffer = reinterpret_cast<const char*>(src_buffer);
        *memory_type = src_memory_type;
        *memory_type_id = src_memory_type_id;
        *buffer_byte_size = src_byte_size;
        expected_next_buffer = *buffer + src_byte_size;
      }
    }
  }
  return contiguous;
}

// Gathers the (possibly scattered) buffers of 'input_name' from all requests
// into the single contiguous destination 'buffer'. Contiguous source regions
// are either copied directly, staged through pinned memory, or recorded for
// the copy kernel (see SetInputTensor); pending staged copies are flushed
// whenever the destination offset becomes non-contiguous and again at the
// end. Accumulates 'need_sync_' when CUDA copies were issued.
void
BackendInputCollector::ProcessTensor(
    const char* input_name, char* buffer, const size_t buffer_byte_size,
    const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id)
{
  // A value of CPU_PINNED indicates that pinned memory buffer is not
  // needed for this tensor. Any other value indicates that a pinned
  // memory buffer is needed when the target memory type matches
  // 'use_pinned_memory_type'.
  TRITONSERVER_MemoryType use_pinned_memory_type =
      TRITONSERVER_MEMORY_CPU_PINNED;
  if (pinned_enabled_) {
    use_pinned_memory_type = GetUsePinnedMemoryType(memory_type);
  }
  const bool use_kernel = (kernel_buffer_threshold_ != 0);

  size_t buffer_offset = 0;

  InputIterator ii(
      requests_, request_count_, responses_, input_name, host_policy_cstr_,
      coalesce_request_input_);
  ContiguousBuffer input;
  while (ii.GetNextContiguousInput(&input)) {
    // If there are pending copies from tensor buffer that is not
    // contiguous with 'response's part of that buffer, then need to
    // go ahead and perform the pending copies so that can start a new
    // contiguous region if necessary.
    if ((pending_pinned_byte_size_ > 0) &&
        (buffer_offset !=
         (pending_pinned_byte_size_ + pending_pinned_offset_))) {
      need_sync_ |= FlushPendingPinned(
          buffer, buffer_byte_size, memory_type, memory_type_id);
    }
    if ((pending_copy_kernel_buffer_byte_size_ > 0) &&
        (buffer_offset != (pending_copy_kernel_buffer_byte_size_ +
                           pending_copy_kernel_buffer_offset_))) {
      need_sync_ |= FlushPendingCopyKernel(
          buffer, buffer_byte_size, memory_type, memory_type_id);
    }

    need_sync_ |= SetInputTensor(
        input_name, input, buffer, buffer_byte_size, memory_type,
        memory_type_id, buffer_offset, use_pinned_memory_type, use_kernel,
        true);

    buffer_offset += input.memory_desc_.byte_size_;
  }

  // Done with the tensor, flush any pending pinned copies.
  need_sync_ |=
      FlushPendingPinned(buffer, buffer_byte_size, memory_type, memory_type_id);
  need_sync_ |= FlushPendingCopyKernel(
      buffer, buffer_byte_size, memory_type, memory_type_id);
#ifdef TRITON_ENABLE_GPU
  // Mark the point on the stream that Finalize() can wait on.
  if (need_sync_ && (event_ != nullptr)) {
    cudaEventRecord(event_, stream_);
  }
#endif  // TRITON_ENABLE_GPU
}

// Overload that can supply the destination buffer itself. If 'buffer' is
// nullptr: when the input data is already contiguous and matches one of
// 'allowed_input_types', the existing buffer is returned without copying;
// otherwise backend memory is allocated from 'allowed_input_types' (tried in
// order) and the data gathered into it. If 'buffer' is non-null it is used
// directly and 'allowed_input_types' must describe exactly that buffer.
// On success the 'dst_*' outputs describe the resulting buffer.
TRITONSERVER_Error*
BackendInputCollector::ProcessTensor(
    const char* input_name, char* buffer, const size_t buffer_byte_size,
    const std::vector<std::pair<TRITONSERVER_MemoryType, int64_t>>&
        allowed_input_types,
    const char** dst_buffer, size_t* dst_buffer_byte_size,
    TRITONSERVER_MemoryType* dst_memory_type, int64_t* dst_memory_type_id)
{
  if (buffer == nullptr) {
    if (allowed_input_types.size() == 0) {
      return TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_INTERNAL,
          "'allowed_input_types' must contain at least one pair of memory type "
          "and id");
    }
    if (GetInputBufferIfContiguous(
            input_name, dst_buffer, dst_buffer_byte_size, dst_memory_type,
            dst_memory_type_id)) {
      // zero size buffer will be treated as contiguous as well,
      // but we want to invoke backend memory to have a valid address.
      if (*dst_buffer_byte_size != 0) {
        // If the buffer is contiguous, check if the caller expects its type
        for (const auto& allowed_type : allowed_input_types) {
          if ((*dst_memory_type == allowed_type.first) &&
              ((*dst_memory_type_id == allowed_type.second))) {
            return nullptr;  // success
          }
        }
      }
    }
    // A separate buffer is needed
    BackendMemory* backend_memory = nullptr;
    for (const auto& allowed_type : allowed_input_types) {
      std::vector<BackendMemory::AllocationType> alloc_types;
      const int64_t memory_type_id = allowed_type.second;
      switch (allowed_type.first) {
        case TRITONSERVER_MEMORY_GPU:
          alloc_types = {
              BackendMemory::AllocationType::GPU_POOL,
              BackendMemory::AllocationType::GPU};
          break;
        case TRITONSERVER_MEMORY_CPU_PINNED:
          alloc_types = {
              BackendMemory::AllocationType::CPU_PINNED_POOL,
              BackendMemory::AllocationType::CPU_PINNED};
          break;
        case TRITONSERVER_MEMORY_CPU:
          alloc_types = {BackendMemory::AllocationType::CPU};
          break;
      }
      auto err = BackendMemory::Create(
          memory_manager_, alloc_types, memory_type_id, *dst_buffer_byte_size,
          &backend_memory);
      if (err != nullptr) {
        // Allocation failure for this type is not fatal; log and try the
        // next allowed type.
        LOG_MESSAGE(
            TRITONSERVER_LOG_VERBOSE,
            (std::string("unable to create backend memory for type: ") +
             TRITONSERVER_MemoryTypeString(allowed_type.first) +
             " id: " + std::to_string(memory_type_id) + ": " +
             TRITONSERVER_ErrorMessage(err))
                .c_str());
        TRITONSERVER_ErrorDelete(err);
      } else {
        in_use_memories_.emplace_back(backend_memory);
        break;
      }
    }
    if (backend_memory == nullptr) {
      return TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_INTERNAL,
          (std::string("failed to allocate contiguous buffer for input '") +
           input_name + "'")
              .c_str());
    }
    buffer = backend_memory->MemoryPtr();
    *dst_buffer = backend_memory->MemoryPtr();
    *dst_buffer_byte_size = backend_memory->ByteSize();
    *dst_memory_type = backend_memory->MemoryType();
    *dst_memory_type_id = backend_memory->MemoryTypeId();
  } else {
    if (allowed_input_types.size() != 1) {
      return TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_INTERNAL,
          "'allowed_input_types' must only contain the memory type and id of "
          "'buffer'");
    }
    *dst_buffer = buffer;
    *dst_buffer_byte_size = buffer_byte_size;
    *dst_memory_type = allowed_input_types[0].first;
    *dst_memory_type_id = allowed_input_types[0].second;
  }
  if (*dst_buffer_byte_size != 0) {
    ProcessTensor(
        input_name, buffer, *dst_buffer_byte_size, *dst_memory_type,
        *dst_memory_type_id);
  }
  return nullptr;  // success
}

// Completes all outstanding work for the collector: synchronizes with
// in-flight CUDA copies when deferred pinned copies exist, performs the
// deferred pinned->tensor copies, and drains results from async CPU copy
// tasks. Returns the final 'need_sync_' state (true when CUDA copies were
// issued here that the caller may still need to synchronize on).
bool
BackendInputCollector::Finalize()
{
#ifdef TRITON_ENABLE_GPU
  if ((!deferred_pinned_.empty()) && need_sync_) {
    if (event_ != nullptr) {
      cudaEventSynchronize(event_);
    } else {
      cudaStreamSynchronize(stream_);
    }
    need_sync_ = false;
  }
#endif  // TRITON_ENABLE_GPU

  // After the above sync all the GPU->pinned copies are complete. Any
  // deferred copies of pinned->CPU can now be done.
#ifdef TRITON_ENABLE_GPU
  if (buffer_ready_event_ != nullptr) {
    cudaEventSynchronize(buffer_ready_event_);
    buffer_ready_event_ = nullptr;
  }
#endif  // TRITON_ENABLE_GPU
  for (auto& def : deferred_pinned_) {
    if (!def.finalized_) {
      need_sync_ |= def.Finalize(stream_);
    }
  }
  // Collect one completion per async task queued by FlushPendingPinned.
  for (size_t i = 0; i < async_task_count_; i++) {
    need_sync_ |= completion_queue_.Get();
  }

#ifdef TRITON_ENABLE_GPU
  // Record the new event location if deferred copies occur
  if ((!deferred_pinned_.empty()) && need_sync_ && (event_ != nullptr)) {
    cudaEventRecord(event_, stream_);
  }
#endif  // TRITON_ENABLE_GPU

  return need_sync_;
}

// Copies the staged pinned buffer into its final location within the tensor
// buffer. On copy failure, sends an error response for every request covered
// by this deferred copy. Returns 'cuda_used' as reported by CopyBuffer.
bool
BackendInputCollector::DeferredPinned::Finalize(cudaStream_t stream)
{
  bool cuda_used = false;
  auto err = CopyBuffer(
      "pinned buffer", TRITONSERVER_MEMORY_CPU_PINNED, 0, tensor_memory_type_,
      tensor_memory_id_, pinned_memory_size_, pinned_memory_,
      tensor_buffer_ + tensor_buffer_offset_, stream, &cuda_used);

  // If something goes wrong with the copy all the pending
  // responses fail...
  if (err != nullptr) {
    for (auto& pr : requests_) {
      for (size_t idx = pr.start_request_idx_; idx <= pr.end_request_idx_;
           ++idx) {
        if ((*responses_)[idx] != nullptr) {
          LOG_IF_ERROR(
              TRITONBACKEND_ResponseSend(
                  (*responses_)[idx], TRITONSERVER_RESPONSE_COMPLETE_FINAL,
                  err),
              "failed to send error response");
          (*responses_)[idx] = nullptr;
        }
      }
    }
    TRITONSERVER_ErrorDelete(err);
  }
  return cuda_used;
}

// Copies (or schedules the copy of) one contiguous input region into
// 'tensor_buffer' at 'tensor_buffer_offset'. Depending on memory types the
// region is either recorded for a later pinned-staged copy, recorded for the
// CUDA copy kernel, or copied directly via CopyBuffer. An out-of-bounds
// region fails all requests it covers. Returns true if a CUDA copy was
// issued here.
bool
BackendInputCollector::SetInputTensor(
    const char* input_name, const ContiguousBuffer& input, char* tensor_buffer,
    const size_t tensor_buffer_byte_size,
    const TRITONSERVER_MemoryType tensor_memory_type,
    const int64_t tensor_memory_type_id, const size_t tensor_buffer_offset,
    const TRITONSERVER_MemoryType use_pinned_memory_type, const bool use_kernel,
    const bool wait_buffer)
{
  bool cuda_copy = false;

  // Guard against the region overrunning the destination buffer.
  if ((tensor_buffer_offset + input.memory_desc_.byte_size_) >
      tensor_buffer_byte_size) {
    for (size_t i = input.start_request_idx_; i <= input.end_request_idx_;
         ++i) {
      RESPOND_AND_SET_NULL_IF_ERROR(
          &(*responses_)[i],
          TRITONSERVER_ErrorNew(
              TRITONSERVER_ERROR_INVALID_ARG,
              std::string(
                  "unexpected total byte size " +
                  std::to_string(
                      tensor_buffer_offset + input.memory_desc_.byte_size_) +
                  " for input '" + input_name + "', expecting " +
                  std::to_string(tensor_buffer_byte_size))
                  .c_str()));
    }
    return cuda_copy;
  }

  // If the request buffer matches the memory type that should use an
  // intermediate pinned memory buffer for the transfer, then just
  // record the input as pending and increase the size required for
  // the intermediate pinned buffer. We only do this check for the
  // first buffer of an input and apply the same policy for all
  // buffers. So if an inputs data is split over different memory
  // types this may not be ideal but that should be a very rare
  // situation.
  if ((use_pinned_memory_type != TRITONSERVER_MEMORY_CPU_PINNED) &&
      (input.memory_desc_.memory_type_ == use_pinned_memory_type)) {
    if (pending_pinned_byte_size_ == 0) {
      pending_pinned_offset_ = tensor_buffer_offset;
    }

    pending_pinned_byte_size_ += input.memory_desc_.byte_size_;
    pending_pinned_input_buffers_.push_back(input);
    return cuda_copy;
  }
  // [FIXME] support other direction if prove to be faster, all kernel
  // handling code in this class assumes the destination buffer is on device
  // If the request buffer and the destination buffer are accessible by all
  // GPUs (i.e. pinned, device), initiate the copy via copy CUDA kernel.
  // We only do this check for the
  // first buffer of an input and apply the same policy for all
  // buffers. So if an inputs data is split over different memory
  // types this may not be ideal but that should be a very rare
  // situation.
  // Currently checked direction:
  // pinned -> device
  // same device -> device
  // different device -> device
  if (use_kernel &&
      (input.memory_desc_.memory_type_ != TRITONSERVER_MEMORY_CPU) &&
      (tensor_memory_type == TRITONSERVER_MEMORY_GPU)) {
    // [FIXME] Currently not allowing copy between devices as it requires
    // peer-to-peer access to be enabled. Peer-to-peer is enabled by default,
    // but server can still runs even if it fails to enable peer-to-peer.
    // Should provide a utility to check whether a device pair allows direct
    // access and use gather kernel accordingly
    if ((input.memory_desc_.memory_type_ != TRITONSERVER_MEMORY_GPU) ||
        (input.memory_desc_.memory_type_id_ == tensor_memory_type_id)) {
      if (pending_copy_kernel_buffer_byte_size_ == 0) {
        pending_copy_kernel_buffer_offset_ = tensor_buffer_offset;
      }

      pending_copy_kernel_buffer_byte_size_ += input.memory_desc_.byte_size_;
      ++pending_copy_kernel_input_buffer_counts_;
      pending_copy_kernel_input_buffers_.push_back(input);
      return cuda_copy;
    }
  }

#ifdef TRITON_ENABLE_GPU
  // Make sure the destination buffer is ready before writing into it.
  if (wait_buffer && (buffer_ready_event_ != nullptr)) {
    cudaEventSynchronize(buffer_ready_event_);
    buffer_ready_event_ = nullptr;
  }
#endif  // TRITON_ENABLE_GPU

  // Direct copy without intermediate pinned memory.
  bool cuda_used = false;
  auto err = CopyBuffer(
      input_name, input.memory_desc_.memory_type_,
      input.memory_desc_.memory_type_id_, tensor_memory_type,
      tensor_memory_type_id, input.memory_desc_.byte_size_,
      input.memory_desc_.buffer_, tensor_buffer + tensor_buffer_offset, stream_,
      &cuda_used, copy_on_stream_);
  if (err != nullptr) {
    for (size_t i = input.start_request_idx_; i <= input.end_request_idx_;
         ++i) {
      RESPOND_AND_SET_NULL_IF_ERROR(
          &(*responses_)[i],
          TRITONSERVER_ErrorNew(
              TRITONSERVER_ErrorCode(err), TRITONSERVER_ErrorMessage(err)));
    }
    TRITONSERVER_ErrorDelete(err);
  }
  cuda_copy |= cuda_used;
  return cuda_copy;
}

// Performs the copies recorded in 'pending_pinned_input_buffers_': stages
// them through a freshly allocated pinned buffer (synchronously, or via
// AsyncWorkQueue tasks when 'use_async_cpu_copy_' is set), falling back to
// direct copies if the pinned allocation fails. Resets the pending-pinned
// bookkeeping before returning. Returns true if any CUDA copy was issued.
bool
BackendInputCollector::FlushPendingPinned(
    char* tensor_buffer, const size_t tensor_buffer_byte_size,
    const TRITONSERVER_MemoryType tensor_memory_type,
    const int64_t tensor_memory_type_id)
{
  bool cuda_copy = false;

  // Will be copying from CPU->pinned->GPU or GPU->pinned->CPU

  // Attempt to allocate a pinned buffer to use for staging the
  // copy... if we fail to allocated the pinned buffer then we just
  // directly go CPU->GPU or GPU->CPU.
  char* pinned_memory = nullptr;
  int64_t pinned_memory_type_id = 0;
  TRITONSERVER_MemoryType pinned_memory_type;
  // 'backend_memory' is only assigned (and later retained) when the pinned
  // allocation below succeeds, i.e. when 'pinned_memory' is non-null.
  BackendMemory* backend_memory;
  if (pending_pinned_byte_size_ > 0) {
    TRITONSERVER_Error* err = BackendMemory::Create(
        memory_manager_,
        {BackendMemory::AllocationType::CPU_PINNED_POOL,
         BackendMemory::AllocationType::CPU_PINNED},
        0 /* memory_type_id */, pending_pinned_byte_size_, &backend_memory);
    if (err != nullptr) {
      TRITONSERVER_ErrorDelete(err);
    } else {
      pinned_memory = backend_memory->MemoryPtr();
      pinned_memory_type = backend_memory->MemoryType();
      pinned_memory_type_id = backend_memory->MemoryTypeId();
    }
  }

  // If the pinned buffer wasn't actually allocated then just perform
  // a direct copy.
  if (pinned_memory == nullptr) {
    size_t offset = 0;
    for (auto& pr : pending_pinned_input_buffers_) {
      cuda_copy |= SetInputTensor(
          "pinned fallback", pr, tensor_buffer, tensor_buffer_byte_size,
          tensor_memory_type, tensor_memory_type_id,
          pending_pinned_offset_ + offset, TRITONSERVER_MEMORY_CPU_PINNED,
          false, true);
      offset += pr.memory_desc_.byte_size_;
    }
  }
  // We have a pinned buffer so copy the pending input buffer(s) into
  // the pinned memory.
  else {  // pinned_memory_type == TRITONSERVER_MEMORY_CPU_PINNED
    bool cuda_used = false;
    size_t offset = 0;
    if (!use_async_cpu_copy_) {
      for (auto& pr : pending_pinned_input_buffers_) {
        cuda_used |= SetInputTensor(
            "pinned H2H", pr, pinned_memory, pending_pinned_byte_size_,
            TRITONSERVER_MEMORY_CPU_PINNED, 0 /* memory_type_id */, offset,
            TRITONSERVER_MEMORY_CPU_PINNED, false, true);
        offset += pr.memory_desc_.byte_size_;
      }

      cuda_copy |= cuda_used;

      // If the copy was not async (i.e. if request input was in CPU so
      // a CPU->CPU-PINNED copy was performed above), then the pinned
      // buffer now holds the tensor contents and we can immediately
      // issue the copies from the pinned buffer to the tensor.
      //
      // Otherwise the GPU->CPU-PINNED async copies are in flight and we
      // simply remember the pinned buffer and the corresponding
      // request inputs so that we can do the pinned->CPU copies in
      // finalize after we have waited for all async copies to complete.
      if (!cuda_used) {
#ifdef TRITON_ENABLE_GPU
        if (buffer_ready_event_ != nullptr) {
          cudaEventSynchronize(buffer_ready_event_);
          buffer_ready_event_ = nullptr;
        }
#endif  // TRITON_ENABLE_GPU
        auto err = CopyBuffer(
            "pinned input buffer H2D", TRITONSERVER_MEMORY_CPU_PINNED,
            0 /* memory_type_id */, tensor_memory_type, tensor_memory_type_id,
            pending_pinned_byte_size_, pinned_memory,
            tensor_buffer + pending_pinned_offset_, stream_, &cuda_used,
            copy_on_stream_);
        cuda_copy |= cuda_used;

        // If something goes wrong with the copy all the pending
        // responses fail...
        if (err != nullptr) {
          for (auto& pr : pending_pinned_input_buffers_) {
            for (size_t idx = pr.start_request_idx_; idx <= pr.end_request_idx_;
                 ++idx) {
              if ((*responses_)[idx] != nullptr) {
                LOG_IF_ERROR(
                    TRITONBACKEND_ResponseSend(
                        (*responses_)[idx],
                        TRITONSERVER_RESPONSE_COMPLETE_FINAL, err),
                    "failed to send error response");
                (*responses_)[idx] = nullptr;
              }
            }
          }
          TRITONSERVER_ErrorDelete(err);
        }
      } else {  // cuda_used
        deferred_pinned_.emplace_back(
            pinned_memory, pending_pinned_byte_size_, tensor_buffer,
            pending_pinned_offset_, tensor_memory_type, tensor_memory_type_id,
            std::move(pending_pinned_input_buffers_), responses_);
      }
    } else {
      // Async path: split the pending inputs into roughly equal segments,
      // one AddTask per segment; the last task to finish runs the deferred
      // pinned->tensor copy and reports through 'completion_queue_'.
      async_task_count_++;
      deferred_pinned_.emplace_back(
          pinned_memory, pending_pinned_byte_size_, tensor_buffer,
          pending_pinned_offset_, tensor_memory_type, tensor_memory_type_id,
          std::move(pending_pinned_input_buffers_), responses_);
      auto& deferred_pinned = deferred_pinned_.back();
      // Mark finalized to avoid duplicated call to DeferredPinned::Finalized()
      // in BackendInputCollector::Finalize()
      deferred_pinned_.back().finalized_ = true;
      auto incomplete_count = new std::atomic<size_t>(std::min(
          deferred_pinned_.back().requests_.size(),
          triton::common::AsyncWorkQueue::WorkerCount()));
      auto pending_pinned_byte_size = pending_pinned_byte_size_;
      size_t stride = (deferred_pinned_.back().requests_.size() +
                       triton::common::AsyncWorkQueue::WorkerCount() - 1) /
                      triton::common::AsyncWorkQueue::WorkerCount();
      auto pending_it = deferred_pinned_.back().requests_.begin();
      while (pending_it != deferred_pinned_.back().requests_.end()) {
        auto end_it = pending_it;
        auto next_offset = offset;
        for (size_t idx = 0; idx < stride; idx++) {
          next_offset += end_it->memory_desc_.byte_size_;
          end_it++;
          if (end_it == deferred_pinned_.back().requests_.end()) {
            break;
          }
        }

        auto err =
            CommonErrorToTritonError(triton::common::AsyncWorkQueue::AddTask(
                [this, offset, pinned_memory, pinned_memory_type,
                 pending_pinned_byte_size, pinned_memory_type_id, pending_it,
                 end_it, incomplete_count, &deferred_pinned]() mutable {
                  for (; pending_it != end_it; pending_it++) {
                    SetInputTensor(
                        "pinned async H2H", *pending_it, pinned_memory,
                        pending_pinned_byte_size, pinned_memory_type,
                        pinned_memory_type_id, offset,
                        TRITONSERVER_MEMORY_CPU_PINNED, false, false);
                    offset += pending_it->memory_desc_.byte_size_;
                  }
                  // The last segmented task will start the next phase of
                  // the internal pinned buffer copy
                  if (incomplete_count->fetch_sub(1) == 1) {
#ifdef TRITON_ENABLE_GPU
                    if (buffer_ready_event_ != nullptr) {
                      cudaEventSynchronize(buffer_ready_event_);
                      buffer_ready_event_ = nullptr;
                    }
#endif  // TRITON_ENABLE_GPU
                    completion_queue_.Put(deferred_pinned.Finalize(stream_));
                    delete incomplete_count;
                  }
                }));
        if (err != nullptr) {
          for (; pending_it != end_it; pending_it++) {
            for (size_t idx = pending_it->start_request_idx_;
                 idx <= pending_it->end_request_idx_; ++idx) {
              if ((*responses_)[idx] != nullptr) {
                LOG_IF_ERROR(
                    TRITONBACKEND_ResponseSend(
                        (*responses_)[idx],
                        TRITONSERVER_RESPONSE_COMPLETE_FINAL, err),
                    "failed to send error response");
                (*responses_)[idx] = nullptr;
              }
            }
          }
        }
        // NOTE(review): 'err' may be nullptr here, i.e. this delete runs on
        // every iteration. Assumes TRITONSERVER_ErrorDelete(nullptr) is a
        // safe no-op — confirm against the server API.
        TRITONSERVER_ErrorDelete(err);

        offset = next_offset;
        pending_it = end_it;
      }
    }
  }

  // Pending pinned copies are handled...
  pending_pinned_byte_size_ = 0;
  pending_pinned_offset_ = 0;
  pending_pinned_input_buffers_.clear();

  // Need to hold on to the allocated pinned buffer as there are still
  // copies in flight. Will delete it in finalize.
  if (pinned_memory != nullptr) {
    in_use_memories_.emplace_back(backend_memory);
  }

  return cuda_copy;
}

// Computes the shape of 'batch_input' over the collected requests according
// to its kind (element counts, accumulated counts, max-element-count, or
// per-item shapes). Returns an error if the kind is unsupported or any
// request's source input cannot be queried.
TRITONSERVER_Error*
BackendInputCollector::BatchInputShape(
    const BatchInput& batch_input, std::vector<int64_t>* shape)
{
  *shape = std::vector<int64_t>{0};
  switch (batch_input.BatchInputKind()) {
    case BatchInput::Kind::BATCH_ELEMENT_COUNT:
    case BatchInput::Kind::BATCH_ACCUMULATED_ELEMENT_COUNT: {
      (*shape)[0] = request_count_;
      break;
    }
    case BatchInput::Kind::BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO: {
      // One extra element for the leading zero.
      (*shape)[0] = request_count_ + 1;
      break;
    }
    case BatchInput::Kind::BATCH_MAX_ELEMENT_COUNT_AS_SHAPE: {
      const auto& source_input = batch_input.SourceInputs()[0];
      for (size_t req_idx = 0; req_idx < request_count_; req_idx++) {
        TRITONBACKEND_Input* input;
        RETURN_IF_ERROR(TRITONBACKEND_RequestInput(
            requests_[req_idx], source_input.c_str(), &input));
        const int64_t* shape_arr;
        uint32_t dims_count;
        int64_t element_cnt = 0;
        RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy(
            input, host_policy_cstr_, nullptr, nullptr, &shape_arr, &dims_count,
            nullptr, nullptr));
        RETURN_IF_ERROR(GetElementCount(shape_arr, dims_count, &element_cnt));
        (*shape)[0] = std::max((*shape)[0], element_cnt);
      }
      break;
    }
    case BatchInput::Kind::BATCH_ITEM_SHAPE: {
      shape->emplace_back(0);
      const auto& source_input = batch_input.SourceInputs()[0];
      for (size_t req_idx = 0; req_idx < request_count_; req_idx++) {
        TRITONBACKEND_Input* input;
        RETURN_IF_ERROR(TRITONBACKEND_RequestInput(
            requests_[req_idx], source_input.c_str(), &input));
        const int64_t* shape_arr;
        uint32_t dims_count;
        RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy(
            input, host_policy_cstr_, nullptr, nullptr, &shape_arr, &dims_count,
            nullptr, nullptr));
        // Assuming first dimension is batch size and ragged input is only set
        // for batching enabled model.
        (*shape)[0] += shape_arr[0];
        // The batch input tracks the shape without batch dimension for
        // each batch item
        (*shape)[1] = (dims_count - 1);
      }
      break;
    }
    case BatchInput::Kind::BATCH_ITEM_SHAPE_FLATTEN: {
      const auto& source_input = batch_input.SourceInputs()[0];
      for (size_t req_idx = 0; req_idx < request_count_; req_idx++) {
        TRITONBACKEND_Input* input;
        RETURN_IF_ERROR(TRITONBACKEND_RequestInput(
            requests_[req_idx], source_input.c_str(), &input));
        const int64_t* shape_arr;
        uint32_t dims_count;
        RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy(
            input, host_policy_cstr_, nullptr, nullptr, &shape_arr, &dims_count,
            nullptr, nullptr));
        // Assuming first dimension is batch size and ragged input is only set
        // for batching enabled model.
        // The batch input tracks the shape without batch dimension for
        // each batch item
        (*shape)[0] += (shape_arr[0] * (dims_count - 1));
      }
      break;
    }
    default:
      return TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_INTERNAL, "unsupported BatchInputKind received");
  }
  return nullptr;  // success
}

// Produces the value of a batch input: allocates the destination buffer when
// 'buffer' is nullptr (trying 'allowed_input_types' in order), stages the
// values through a CPU buffer when the destination is on GPU, then fills the
// buffer according to the batch-input kind.
TRITONSERVER_Error*
BackendInputCollector::ProcessBatchInput(
    const BatchInput& batch_input, char* buffer, const size_t buffer_byte_size,
    const std::vector<std::pair<TRITONSERVER_MemoryType, int64_t>>&
        allowed_input_types,
    const char** dst_buffer, size_t* dst_buffer_byte_size,
    TRITONSERVER_MemoryType* dst_memory_type, int64_t* dst_memory_type_id)
{
#ifdef TRITON_ENABLE_GPU
  if (buffer_ready_event_ != nullptr) {
    cudaEventSynchronize(buffer_ready_event_);
    buffer_ready_event_ = nullptr;
  }
#endif  // TRITON_ENABLE_GPU
  if (buffer == nullptr) {
    if (allowed_input_types.size() == 0) {
      return TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_INTERNAL,
          "'allowed_input_types' must contain at least one pair of memory type "
          "and id");
    }
    // Calculate the byte size of the buffer
    std::vector<int64_t> shape;
    RETURN_IF_ERROR(BatchInputShape(batch_input, &shape));
    RETURN_IF_ERROR(GetByteSize(
        batch_input.DataType(), shape,
        reinterpret_cast<int64_t*>(dst_buffer_byte_size)));
    BackendMemory* backend_memory = nullptr;
    for (const auto& allowed_type : allowed_input_types) {
      std::vector<BackendMemory::AllocationType> alloc_types;
      const int64_t memory_type_id = allowed_type.second;
      switch (allowed_type.first) {
        case TRITONSERVER_MEMORY_GPU:
          alloc_types = {
              BackendMemory::AllocationType::GPU_POOL,
              BackendMemory::AllocationType::GPU};
          break;
        case TRITONSERVER_MEMORY_CPU_PINNED:
          alloc_types = {
              BackendMemory::AllocationType::CPU_PINNED_POOL,
              BackendMemory::AllocationType::CPU_PINNED};
          break;
        case TRITONSERVER_MEMORY_CPU:
          alloc_types = {BackendMemory::AllocationType::CPU};
          break;
      }
      auto err = BackendMemory::Create(
          memory_manager_, alloc_types, memory_type_id, *dst_buffer_byte_size,
          &backend_memory);
      if (err != nullptr) {
        LOG_MESSAGE(
            TRITONSERVER_LOG_VERBOSE,
            (std::string("unable to create backend memory for type: ") +
             TRITONSERVER_MemoryTypeString(allowed_type.first) +
             " id: " + std::to_string(memory_type_id) + ": " +
             TRITONSERVER_ErrorMessage(err))
                .c_str());
        TRITONSERVER_ErrorDelete(err);
      } else {
        in_use_memories_.emplace_back(backend_memory);
        break;
      }
    }
    if (backend_memory == nullptr) {
      return TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_INTERNAL,
          (std::string(
               "failed to allocate contiguous buffer for batch input '") +
           batch_input.TargetNames()[0] + "'")
              .c_str());
    }
    buffer = backend_memory->MemoryPtr();
    *dst_buffer = backend_memory->MemoryPtr();
    *dst_buffer_byte_size = backend_memory->ByteSize();
    *dst_memory_type = backend_memory->MemoryType();
    *dst_memory_type_id = backend_memory->MemoryTypeId();
  } else {
    if (allowed_input_types.size() != 1) {
      return TRITONSERVER_ErrorNew(
          TRITONSERVER_ERROR_INTERNAL,
          "'allowed_input_types' must only contain the memory type and id of "
          "'buffer'");
    }
    *dst_buffer = buffer;
    *dst_buffer_byte_size = buffer_byte_size;
    *dst_memory_type = allowed_input_types[0].first;
    *dst_memory_type_id = allowed_input_types[0].second;
  }

  char* input_buffer = buffer;
  std::unique_ptr<BackendMemory> internal_buffer;
  // Need a CPU buffer for modifying the value
  if (*dst_memory_type == TRITONSERVER_MEMORY_GPU) {
    BackendMemory* ib = nullptr;
    RETURN_IF_ERROR(BackendMemory::Create(
        memory_manager_,
        {BackendMemory::AllocationType::CPU_PINNED_POOL,
         BackendMemory::AllocationType::CPU},
        0, *dst_buffer_byte_size, &ib));
    internal_buffer.reset(ib);
    input_buffer = internal_buffer->MemoryPtr();
  }
  const auto& data_type = batch_input.DataType();
  switch (batch_input.BatchInputKind()) {
    case BatchInput::Kind::BATCH_ELEMENT_COUNT: {
      const auto& source_input = batch_input.SourceInputs()[0];
      if (data_type == TRITONSERVER_TYPE_FP32) {
        RETURN_IF_ERROR(SetElementCount<float>(
            source_input, input_buffer, *dst_buffer_byte_size));
      } else {
        RETURN_IF_ERROR(SetElementCount<int32_t>(
            source_input, input_buffer, *dst_buffer_byte_size));
      }
      break;
    }
    case BatchInput::Kind::BATCH_ACCUMULATED_ELEMENT_COUNT: {
      const auto& source_input = batch_input.SourceInputs()[0];
      if (data_type == TRITONSERVER_TYPE_FP32) {
        RETURN_IF_ERROR(SetAccumulatedElementCount<float>(
            source_input, input_buffer, *dst_buffer_byte_size));
      } else {
        RETURN_IF_ERROR(SetAccumulatedElementCount<int32_t>(
            source_input, input_buffer, *dst_buffer_byte_size));
      }
      break;
    }
    case BatchInput::Kind::BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO: {
      const auto& source_input = batch_input.SourceInputs()[0];
      if (data_type == TRITONSERVER_TYPE_FP32) {
        *reinterpret_cast<float*>(input_buffer) = 0;
        if (*dst_buffer_byte_size < sizeof(float)) {
          return TRITONSERVER_ErrorNew(
              TRITONSERVER_ERROR_INVALID_ARG,
              (std::string(
                   "Unexpected total byte size for batch input. Expect >= ") +
               std::to_string(sizeof(float)) + ", got " +
               std::to_string(*dst_buffer_byte_size))
                  .c_str());
        }

        RETURN_IF_ERROR(SetAccumulatedElementCount<float>(
            source_input, input_buffer + sizeof(float),
            *dst_buffer_byte_size - sizeof(float)));
      } else {
        *reinterpret_cast<int32_t*>(input_buffer) = 0;
        if (*dst_buffer_byte_size < sizeof(int32_t)) {
          return TRITONSERVER_ErrorNew(
              TRITONSERVER_ERROR_INVALID_ARG,
              (std::string(
                   "Unexpected total byte size for batch input. Expect >= ") +
               std::to_string(sizeof(int32_t)) + ", got " +
               std::to_string(*dst_buffer_byte_size))
                  .c_str());
        }

        RETURN_IF_ERROR(SetAccumulatedElementCount<int32_t>(
            source_input, input_buffer + sizeof(int32_t),
            *dst_buffer_byte_size - sizeof(int32_t)));
      }
      break;
    }
    case BatchInput::Kind::BATCH_MAX_ELEMENT_COUNT_AS_SHAPE: {
      // The batch input is described by the shape,
      // no data modification is needed
      return nullptr;  // success
    }
    case BatchInput::Kind::BATCH_ITEM_SHAPE:
    case BatchInput::Kind::BATCH_ITEM_SHAPE_FLATTEN: {
      // Use the same utilities for both types as the data will be the same,
      // only difference is the shape of the tensor.
      const auto& source_input = batch_input.SourceInputs()[0];
      if (data_type == TRITONSERVER_TYPE_FP32) {
        *reinterpret_cast<float*>(input_buffer) = 0;
        RETURN_IF_ERROR(SetBatchItemShape<float>(
            source_input, input_buffer, *dst_buffer_byte_size));
      } else {
        *reinterpret_cast<int32_t*>(input_buffer) = 0;
        RETURN_IF_ERROR(SetBatchItemShape<int32_t>(
            source_input, input_buffer, *dst_buffer_byte_size));
      }
      break;
    }
  }
  if 
(*dst_memory_type == TRITONSERVER_MEMORY_GPU) {\n    bool cuda_used;\n    RETURN_IF_ERROR(CopyBuffer(\n        \"batch input buffer\", internal_buffer->MemoryType(),\n        internal_buffer->MemoryTypeId(), *dst_memory_type, *dst_memory_type_id,\n        *dst_buffer_byte_size, input_buffer, buffer, stream_, &cuda_used,\n        copy_on_stream_));\n    // Need to keep the backend memory alive in the case of async copy\n    in_use_memories_.emplace_back(std::move(internal_buffer));\n    need_sync_ |= cuda_used;\n  }\n  return nullptr;  // success\n}\n\ntemplate <typename T>\nTRITONSERVER_Error*\nBackendInputCollector::SetElementCount(\n    const std::string& source_input, char* buffer,\n    const size_t buffer_byte_size)\n{\n  size_t buffer_offset = 0;\n  for (size_t req_idx = 0; req_idx < request_count_; req_idx++) {\n    if (buffer_offset + sizeof(T) > buffer_byte_size) {\n      return TRITONSERVER_ErrorNew(\n          TRITONSERVER_ERROR_INVALID_ARG,\n          \"unexpected total byte size for batch input\");\n    }\n\n    TRITONBACKEND_Input* input;\n    RETURN_IF_ERROR(TRITONBACKEND_RequestInput(\n        requests_[req_idx], source_input.c_str(), &input));\n    const int64_t* shape;\n    uint32_t dims_count;\n    int64_t element_cnt = 0;\n    RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy(\n        input, host_policy_cstr_, nullptr, nullptr, &shape, &dims_count,\n        nullptr, nullptr));\n    RETURN_IF_ERROR(GetElementCount(shape, dims_count, &element_cnt));\n    *(reinterpret_cast<T*>(buffer) + req_idx) = element_cnt;\n    buffer_offset += sizeof(T);\n  }\n  // Set the rest of the buffer to 0\n  for (; buffer_offset + sizeof(T) <= buffer_byte_size;\n       buffer_offset += sizeof(T)) {\n    *reinterpret_cast<T*>(buffer + buffer_offset) = 0;\n  }\n  return nullptr;  // success\n}\n\ntemplate <typename T>\nTRITONSERVER_Error*\nBackendInputCollector::SetAccumulatedElementCount(\n    const std::string& source_input, char* buffer,\n    const size_t 
buffer_byte_size)\n{\n  size_t accumulated_element_count = 0;\n  size_t buffer_offset = 0;\n  for (size_t req_idx = 0; req_idx < request_count_; req_idx++) {\n    if (buffer_offset + sizeof(T) > buffer_byte_size) {\n      return TRITONSERVER_ErrorNew(\n          TRITONSERVER_ERROR_INVALID_ARG,\n          \"unexpected total byte size for batch input\");\n    }\n\n    TRITONBACKEND_Input* input;\n    RETURN_IF_ERROR(TRITONBACKEND_RequestInput(\n        requests_[req_idx], source_input.c_str(), &input));\n    const int64_t* shape;\n    uint32_t dims_count;\n    int64_t element_cnt = 0;\n    RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy(\n        input, host_policy_cstr_, nullptr, nullptr, &shape, &dims_count,\n        nullptr, nullptr));\n    RETURN_IF_ERROR(GetElementCount(shape, dims_count, &element_cnt));\n    accumulated_element_count += element_cnt;\n    *(reinterpret_cast<T*>(buffer) + req_idx) = accumulated_element_count;\n    buffer_offset += sizeof(T);\n  }\n  // Set the rest of the buffer to 'accumulated_element_count'\n  // (no increase in element count)\n  for (; buffer_offset + sizeof(T) <= buffer_byte_size;\n       buffer_offset += sizeof(T)) {\n    *reinterpret_cast<T*>(buffer + buffer_offset) = accumulated_element_count;\n  }\n  return nullptr;  // success\n}\n\ntemplate <typename T>\nTRITONSERVER_Error*\nBackendInputCollector::SetBatchItemShape(\n    const std::string& source_input, char* buffer,\n    const size_t buffer_byte_size)\n{\n  size_t buffer_offset = 0;\n  for (size_t req_idx = 0; req_idx < request_count_; req_idx++) {\n    TRITONBACKEND_Input* input;\n    RETURN_IF_ERROR(TRITONBACKEND_RequestInput(\n        requests_[req_idx], source_input.c_str(), &input));\n    const int64_t* shape;\n    uint32_t dims_count;\n    RETURN_IF_ERROR(TRITONBACKEND_InputPropertiesForHostPolicy(\n        input, host_policy_cstr_, nullptr, nullptr, &shape, &dims_count,\n        nullptr, nullptr));\n    // Assuming first dimension is batch size and 
ragged input is only set\n    // for batching enabled model.\n    size_t batch_1_size = sizeof(T) * (dims_count - 1);\n    if (buffer_offset + (size_t)shape[0] * batch_1_size > buffer_byte_size) {\n      return TRITONSERVER_ErrorNew(\n          TRITONSERVER_ERROR_INVALID_ARG,\n          (GetRequestId(requests_[req_idx]) +\n           \"unexpected total byte size for batch input\")\n              .c_str());\n    }\n    // The batch input tracks the shape without batch dimension for\n    // each batch item\n    for (size_t idx = 1; idx < dims_count; ++idx) {\n      // Need to set the element explicitly for type conversion\n      *(reinterpret_cast<T*>(buffer + buffer_offset) + (idx - 1)) = shape[idx];\n    }\n    // memcpy the data repeatedly if the request has batch size > 1\n    for (int64_t idx = 1; idx < shape[0]; ++idx) {\n      memcpy(\n          buffer + buffer_offset + idx * batch_1_size, buffer + buffer_offset,\n          batch_1_size);\n    }\n    buffer_offset += batch_1_size * (size_t)shape[0];\n  }\n  return nullptr;  // success\n}\n\nbool\nBackendInputCollector::FlushPendingCopyKernel(\n    char* tensor_buffer, const size_t tensor_buffer_byte_size,\n    const TRITONSERVER_MemoryType tensor_memory_type,\n    const int64_t tensor_memory_type_id)\n{\n  if (pending_copy_kernel_input_buffers_.size() == 0) {\n    return false;\n  }\n\n  bool cuda_copy = false;\n  TRITONSERVER_Error* error = nullptr;\n  // Only try to launch kernel if buffer count is large enough for\n  // good GPU utilization\n  if (pending_copy_kernel_input_buffer_counts_ >= kernel_buffer_threshold_) {\n    error = LaunchCopyKernel(\n        tensor_buffer, tensor_buffer_byte_size, tensor_memory_type,\n        tensor_memory_type_id);\n    cuda_copy = (error == nullptr);\n    LOG_MESSAGE(\n        TRITONSERVER_LOG_VERBOSE,\n        (std::string(\"gather kernel launched with status: \") +\n         ((error == nullptr) ? 
\"Success\" : TRITONSERVER_ErrorMessage(error)))\n            .c_str());\n  }\n  // If kernel can't be launched then just perform a direct copy.\n  if ((pending_copy_kernel_input_buffer_counts_ < kernel_buffer_threshold_) ||\n      (error != nullptr)) {\n    size_t offset = 0;\n    for (auto& pr : pending_copy_kernel_input_buffers_) {\n      cuda_copy |= SetInputTensor(\n          \"gather kernel fallback\", pr, tensor_buffer, tensor_buffer_byte_size,\n          tensor_memory_type, tensor_memory_type_id,\n          pending_copy_kernel_buffer_offset_ + offset,\n          TRITONSERVER_MEMORY_CPU_PINNED, false, true);\n      offset += pr.memory_desc_.byte_size_;\n    }\n  }\n  TRITONSERVER_ErrorDelete(error);\n\n  // Pending kernel copies are handled...\n  pending_copy_kernel_buffer_byte_size_ = 0;\n  pending_copy_kernel_buffer_offset_ = 0;\n  pending_copy_kernel_input_buffer_counts_ = 0;\n  pending_copy_kernel_input_buffers_.clear();\n\n  return cuda_copy;\n}\n\nTRITONSERVER_Error*\nBackendInputCollector::LaunchCopyKernel(\n    char* tensor_buffer, const size_t tensor_buffer_byte_size,\n    const TRITONSERVER_MemoryType tensor_memory_type,\n    const int64_t tensor_memory_type_id)\n{\n#ifdef TRITON_ENABLE_GPU\n  input_ptr_buffer_host_.emplace_back(new std::vector<int8_t*>());\n  byte_size_buffer_host_.emplace_back(new std::vector<size_t>());\n  byte_size_offset_buffer_host_.emplace_back(new std::vector<size_t>());\n\n  auto& input_ptr_buffer_host = *input_ptr_buffer_host_.back();\n  auto& byte_size_buffer_host = *byte_size_buffer_host_.back();\n  auto& byte_size_offset_buffer_host = *byte_size_offset_buffer_host_.back();\n\n  input_ptr_buffer_host.reserve(pending_copy_kernel_input_buffer_counts_);\n  byte_size_buffer_host.reserve(pending_copy_kernel_input_buffer_counts_);\n  byte_size_offset_buffer_host.reserve(\n      pending_copy_kernel_input_buffer_counts_);\n\n  size_t byte_size_offset = 0;\n  for (const auto& response_input : pending_copy_kernel_input_buffers_) 
{\n    const auto& input = response_input.memory_desc_;\n    input_ptr_buffer_host.emplace_back(\n        const_cast<int8_t*>(reinterpret_cast<const int8_t*>(input.buffer_)));\n    byte_size_buffer_host.emplace_back(input.byte_size_);\n    byte_size_offset_buffer_host.emplace_back(byte_size_offset);\n    byte_size_offset += input.byte_size_;\n  }\n\n  BackendMemory* backend_memory = nullptr;\n  std::vector<BackendMemory::AllocationType> alloc_types;\n  switch (tensor_memory_type) {\n    case TRITONSERVER_MEMORY_GPU:\n      alloc_types = {\n          BackendMemory::AllocationType::GPU_POOL,\n          BackendMemory::AllocationType::GPU};\n      break;\n    case TRITONSERVER_MEMORY_CPU_PINNED:\n      alloc_types = {\n          BackendMemory::AllocationType::CPU_PINNED_POOL,\n          BackendMemory::AllocationType::CPU_PINNED};\n      break;\n    case TRITONSERVER_MEMORY_CPU:\n      alloc_types = {BackendMemory::AllocationType::CPU};\n      break;\n  }\n\n  // input_ptr_buffer\n  size_t input_ptr_buffer_byte_size =\n      pending_copy_kernel_input_buffer_counts_ * sizeof(int8_t*);\n  auto err = BackendMemory::Create(\n      memory_manager_, alloc_types, tensor_memory_type_id,\n      input_ptr_buffer_byte_size, &backend_memory);\n  if (err != nullptr) {\n    LOG_MESSAGE(\n        TRITONSERVER_LOG_VERBOSE,\n        (std::string(\"unable to create backend memory for type: \") +\n         TRITONSERVER_MemoryTypeString(tensor_memory_type) +\n         \" id: \" + std::to_string(tensor_memory_type_id) + \": \" +\n         TRITONSERVER_ErrorMessage(err))\n            .c_str());\n    TRITONSERVER_ErrorDelete(err);\n  } else {\n    in_use_memories_.emplace_back(backend_memory);\n  }\n  if (backend_memory == nullptr ||\n      (backend_memory->MemoryType() != tensor_memory_type) ||\n      (backend_memory->MemoryTypeId() != tensor_memory_type_id)) {\n    return TRITONSERVER_ErrorNew(\n        TRITONSERVER_ERROR_INTERNAL,\n        \"Failed to obtain memory buffer for copy kernel 
input\");\n  }\n  char* input_ptr_buffer = backend_memory->MemoryPtr();\n\n  // byte_size_buffer\n  size_t byte_size_buffer_byte_size =\n      pending_copy_kernel_input_buffer_counts_ * sizeof(size_t);\n  err = BackendMemory::Create(\n      memory_manager_, alloc_types, tensor_memory_type_id,\n      byte_size_buffer_byte_size, &backend_memory);\n  if (err != nullptr) {\n    LOG_MESSAGE(\n        TRITONSERVER_LOG_VERBOSE,\n        (std::string(\"unable to create backend memory for type: \") +\n         TRITONSERVER_MemoryTypeString(tensor_memory_type) +\n         \" id: \" + std::to_string(tensor_memory_type_id) + \": \" +\n         TRITONSERVER_ErrorMessage(err))\n            .c_str());\n    TRITONSERVER_ErrorDelete(err);\n  } else {\n    in_use_memories_.emplace_back(backend_memory);\n  }\n  if (backend_memory == nullptr ||\n      (backend_memory->MemoryType() != tensor_memory_type) ||\n      (backend_memory->MemoryTypeId() != tensor_memory_type_id)) {\n    return TRITONSERVER_ErrorNew(\n        TRITONSERVER_ERROR_INTERNAL,\n        \"Failed to obtain memory buffer for copy kernel input\");\n  }\n  char* byte_size_buffer = backend_memory->MemoryPtr();\n\n  // byte_size_offset_buffer\n  size_t byte_size_offset_buffer_byte_size =\n      pending_copy_kernel_input_buffer_counts_ * sizeof(size_t);\n  err = BackendMemory::Create(\n      memory_manager_, alloc_types, tensor_memory_type_id,\n      byte_size_offset_buffer_byte_size, &backend_memory);\n  if (err != nullptr) {\n    LOG_MESSAGE(\n        TRITONSERVER_LOG_VERBOSE,\n        (std::string(\"unable to create backend memory for type: \") +\n         TRITONSERVER_MemoryTypeString(tensor_memory_type) +\n         \" id: \" + std::to_string(tensor_memory_type_id) + \": \" +\n         TRITONSERVER_ErrorMessage(err))\n            .c_str());\n    TRITONSERVER_ErrorDelete(err);\n  } else {\n    in_use_memories_.emplace_back(backend_memory);\n  }\n  if (backend_memory == nullptr ||\n      (backend_memory->MemoryType() != 
tensor_memory_type) ||\n      (backend_memory->MemoryTypeId() != tensor_memory_type_id)) {\n    return TRITONSERVER_ErrorNew(\n        TRITONSERVER_ERROR_INTERNAL,\n        \"Failed to obtain memory buffer for copy kernel input\");\n  }\n  char* byte_size_offset_buffer = backend_memory->MemoryPtr();\n\n  cudaMemcpyAsync(\n      input_ptr_buffer, input_ptr_buffer_host.data(),\n      pending_copy_kernel_input_buffer_counts_ * sizeof(int8_t*),\n      cudaMemcpyDefault, stream_);\n  cudaMemcpyAsync(\n      byte_size_buffer, byte_size_buffer_host.data(),\n      pending_copy_kernel_input_buffer_counts_ * sizeof(size_t),\n      cudaMemcpyDefault, stream_);\n  cudaMemcpyAsync(\n      byte_size_offset_buffer, byte_size_offset_buffer_host.data(),\n      pending_copy_kernel_input_buffer_counts_ * sizeof(size_t),\n      cudaMemcpyDefault, stream_);\n  if (buffer_ready_event_ != nullptr) {\n    cudaEventSynchronize(buffer_ready_event_);\n    buffer_ready_event_ = nullptr;\n  }\n  RETURN_IF_CUDA_ERROR(\n      RunGatherKernel(\n          (const int8_t**)input_ptr_buffer, (const size_t*)byte_size_buffer,\n          (const size_t*)byte_size_offset_buffer,\n          (int8_t*)tensor_buffer + pending_copy_kernel_buffer_offset_,\n          pending_copy_kernel_input_buffer_counts_, stream_),\n      TRITONSERVER_ERROR_INTERNAL,\n      std::string(\"Failed to launch gather kernel\"));\n  return nullptr;\n#else\n  return TRITONSERVER_ErrorNew(\n      TRITONSERVER_ERROR_UNSUPPORTED,\n      \"Copy kernel can not be launched with TRITON_ENABLE_GPU=OFF\");\n#endif  // TRITON_ENABLE_GPU\n}\n\n}}  // namespace triton::backend\n"
  },
  {
    "path": "src/backend_memory.cc",
    "content": "// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.\n//\n// Redistribution and use in source and binary forms, with or without\n// modification, are permitted provided that the following conditions\n// are met:\n//  * Redistributions of source code must retain the above copyright\n//    notice, this list of conditions and the following disclaimer.\n//  * Redistributions in binary form must reproduce the above copyright\n//    notice, this list of conditions and the following disclaimer in the\n//    documentation and/or other materials provided with the distribution.\n//  * Neither the name of NVIDIA CORPORATION nor the names of its\n//    contributors may be used to endorse or promote products derived\n//    from this software without specific prior written permission.\n//\n// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n// PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n#include \"triton/backend/backend_memory.h\"\n\n#include <map>\n\n#include \"triton/backend/backend_common.h\"\n\nnamespace triton { namespace backend {\n\nTRITONSERVER_Error*\nBackendMemory::Create(\n    TRITONBACKEND_MemoryManager* manager, const AllocationType alloc_type,\n    const int64_t memory_type_id, const size_t byte_size, BackendMemory** mem)\n{\n  *mem = nullptr;\n\n  void* ptr = nullptr;\n  switch (alloc_type) {\n    case AllocationType::CPU_PINNED: {\n#ifdef TRITON_ENABLE_GPU\n      RETURN_IF_CUDA_ERROR(\n          cudaHostAlloc(&ptr, byte_size, cudaHostAllocPortable),\n          TRITONSERVER_ERROR_UNAVAILABLE,\n          std::string(\"failed to allocate pinned system memory\"));\n#else\n      return TRITONSERVER_ErrorNew(\n          TRITONSERVER_ERROR_UNSUPPORTED,\n          \"pinned-memory allocation not supported\");\n#endif  // TRITON_ENABLE_GPU\n      break;\n    }\n\n    case AllocationType::GPU: {\n#ifdef TRITON_ENABLE_GPU\n      int current_device;\n      RETURN_IF_CUDA_ERROR(\n          cudaGetDevice(&current_device), TRITONSERVER_ERROR_INTERNAL,\n          std::string(\"failed to get device\"));\n      bool overridden = (current_device != memory_type_id);\n      if (overridden) {\n        RETURN_IF_CUDA_ERROR(\n            cudaSetDevice(memory_type_id), TRITONSERVER_ERROR_INTERNAL,\n            std::string(\"failed to set device\"));\n      }\n\n      auto err = cudaMalloc(&ptr, byte_size);\n\n      if 
(overridden) {\n        LOG_IF_CUDA_ERROR(\n            cudaSetDevice(current_device), \"failed to set CUDA device\");\n      }\n\n      RETURN_ERROR_IF_FALSE(\n          err == cudaSuccess, TRITONSERVER_ERROR_UNAVAILABLE,\n          std::string(\"failed to allocate GPU memory: \") +\n              cudaGetErrorString(err));\n#else\n      return TRITONSERVER_ErrorNew(\n          TRITONSERVER_ERROR_UNSUPPORTED, \"GPU allocation not supported\");\n#endif  // TRITON_ENABLE_GPU\n      break;\n    }\n\n    case AllocationType::CPU:\n    case AllocationType::CPU_PINNED_POOL:\n    case AllocationType::GPU_POOL:\n      RETURN_IF_ERROR(TRITONBACKEND_MemoryManagerAllocate(\n          manager, &ptr, AllocTypeToMemoryType(alloc_type), memory_type_id,\n          byte_size));\n      break;\n  }\n\n  *mem = new BackendMemory(\n      manager, alloc_type, memory_type_id, reinterpret_cast<char*>(ptr),\n      byte_size);\n\n  return nullptr;  // success\n}\n\nTRITONSERVER_Error*\nBackendMemory::Create(\n    TRITONBACKEND_MemoryManager* manager,\n    const std::vector<AllocationType>& alloc_types,\n    const int64_t memory_type_id, const size_t byte_size, BackendMemory** mem)\n{\n  *mem = nullptr;\n  RETURN_ERROR_IF_TRUE(\n      alloc_types.size() == 0, TRITONSERVER_ERROR_INVALID_ARG,\n      std::string(\"BackendMemory::Create, at least one allocation type must be \"\n                  \"specified\"));\n\n  bool success = false;\n  std::unordered_map<AllocationType, TRITONSERVER_Error*> errors;\n  for (const AllocationType alloc_type : alloc_types) {\n    TRITONSERVER_Error* err =\n        Create(manager, alloc_type, memory_type_id, byte_size, mem);\n    if (err == nullptr) {\n      success = true;\n      break;\n    }\n\n    errors.insert({alloc_type, err});\n  }\n\n  // If allocation failed for all allocation types then display all\n  // the error messages and show the entire allocation request as\n  // failing.\n  if (!success) {\n    std::string msg = \"BackendMemory::Create, all 
allocation types failed:\";\n    for (const auto& pr : errors) {\n      const AllocationType alloc_type = pr.first;\n      TRITONSERVER_Error* err = pr.second;\n      msg += std::string(\"\\n\\t\") + AllocTypeString(alloc_type) + \": \" +\n             TRITONSERVER_ErrorMessage(err);\n      TRITONSERVER_ErrorDelete(err);\n    }\n\n    return TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_UNAVAILABLE, msg.c_str());\n  }\n\n  // If it succeeded we might have to clean up errors associated with\n  // attempts that failed\n  for (const auto& pr : errors) {\n    TRITONSERVER_ErrorDelete(pr.second);\n  }\n\n  return nullptr;  // success\n}\n\nTRITONSERVER_Error*\nBackendMemory::Create(\n    TRITONBACKEND_MemoryManager* manager, const AllocationType alloc_type,\n    const int64_t memory_type_id, void* buffer, const size_t byte_size,\n    BackendMemory** mem)\n{\n  *mem = new BackendMemory(\n      manager, alloc_type, memory_type_id, reinterpret_cast<char*>(buffer),\n      byte_size, false /* owns_buffer */);\n\n  return nullptr;  // success\n}\n\nBackendMemory::~BackendMemory()\n{\n  if (owns_buffer_) {\n    switch (alloctype_) {\n      case AllocationType::CPU_PINNED:\n#ifdef TRITON_ENABLE_GPU\n        if (buffer_ != nullptr) {\n          LOG_IF_CUDA_ERROR(\n              cudaFreeHost(buffer_), \"failed to free pinned memory\");\n        }\n#endif  // TRITON_ENABLE_GPU\n        break;\n\n      case AllocationType::GPU:\n#ifdef TRITON_ENABLE_GPU\n        if (buffer_ != nullptr) {\n          LOG_IF_CUDA_ERROR(cudaFree(buffer_), \"failed to free CUDA memory\");\n        }\n#endif  // TRITON_ENABLE_GPU\n        break;\n\n      case AllocationType::CPU:\n      case AllocationType::CPU_PINNED_POOL:\n      case AllocationType::GPU_POOL:\n        LOG_IF_ERROR(\n            TRITONBACKEND_MemoryManagerFree(\n                manager_, buffer_, AllocTypeToMemoryType(alloctype_),\n                memtype_id_),\n            \"failed to free memory buffer\");\n        break;\n    }\n  
}\n}\n\nTRITONSERVER_MemoryType\nBackendMemory::AllocTypeToMemoryType(const AllocationType a)\n{\n  switch (a) {\n    case AllocationType::CPU:\n      return TRITONSERVER_MEMORY_CPU;\n    case AllocationType::CPU_PINNED:\n    case AllocationType::CPU_PINNED_POOL:\n      return TRITONSERVER_MEMORY_CPU_PINNED;\n    case AllocationType::GPU:\n    case AllocationType::GPU_POOL:\n      return TRITONSERVER_MEMORY_GPU;\n  }\n\n  return TRITONSERVER_MEMORY_CPU;  // unreachable\n}\n\nconst char*\nBackendMemory::AllocTypeString(const AllocationType a)\n{\n  switch (a) {\n    case AllocationType::CPU:\n      return \"CPU\";\n    case AllocationType::CPU_PINNED:\n      return \"CPU_PINNED\";\n    case AllocationType::GPU:\n      return \"GPU\";\n    case AllocationType::CPU_PINNED_POOL:\n      return \"CPU_PINNED_POOL\";\n    case AllocationType::GPU_POOL:\n      return \"GPU_POOL\";\n  }\n\n  return \"<unknown>\";\n}\n\n}}  // namespace triton::backend\n"
  },
  {
    "path": "src/backend_model.cc",
    "content": "// Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n//\n// Redistribution and use in source and binary forms, with or without\n// modification, are permitted provided that the following conditions\n// are met:\n//  * Redistributions of source code must retain the above copyright\n//    notice, this list of conditions and the following disclaimer.\n//  * Redistributions in binary form must reproduce the above copyright\n//    notice, this list of conditions and the following disclaimer in the\n//    documentation and/or other materials provided with the distribution.\n//  * Neither the name of NVIDIA CORPORATION nor the names of its\n//    contributors may be used to endorse or promote products derived\n//    from this software without specific prior written permission.\n//\n// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n// PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n#include \"triton/backend/backend_model.h\"\n\n#include \"triton/backend/backend_common.h\"\n\nnamespace triton { namespace backend {\n\n//\n// BackendModel\n//\nBackendModel::BackendModel(\n    TRITONBACKEND_Model* triton_model, const bool allow_optional)\n    : triton_model_(triton_model), allow_optional_(allow_optional)\n{\n  const char* model_name;\n  THROW_IF_BACKEND_MODEL_ERROR(\n      TRITONBACKEND_ModelName(triton_model, &model_name));\n  name_ = model_name;\n\n  THROW_IF_BACKEND_MODEL_ERROR(\n      TRITONBACKEND_ModelVersion(triton_model, &version_));\n\n  const char* repository_path = nullptr;\n  TRITONBACKEND_ArtifactType repository_artifact_type;\n  THROW_IF_BACKEND_MODEL_ERROR(TRITONBACKEND_ModelRepository(\n      triton_model, &repository_artifact_type, &repository_path));\n  if (repository_artifact_type != TRITONBACKEND_ARTIFACT_FILESYSTEM) {\n    throw BackendModelException(TRITONSERVER_ErrorNew(\n        TRITONSERVER_ERROR_UNSUPPORTED,\n        (std::string(\"unsupported repository artifact type for model '\") +\n         model_name + \"'\")\n            .c_str()));\n  }\n  repository_path_ = repository_path;\n\n  THROW_IF_BACKEND_MODEL_ERROR(\n      TRITONBACKEND_ModelServer(triton_model, &triton_server_));\n  TRITONBACKEND_Backend* backend;\n  THROW_IF_BACKEND_MODEL_ERROR(\n      TRITONBACKEND_ModelBackend(triton_model, &backend));\n  THROW_IF_BACKEND_MODEL_ERROR(\n      
TRITONBACKEND_BackendMemoryManager(backend, &triton_memory_manager_));\n\n  THROW_IF_BACKEND_MODEL_ERROR(ParseModelConfig());\n}\n\nTRITONSERVER_Error*\nBackendModel::ParseModelConfig()\n{\n  TRITONSERVER_Message* config_message;\n  RETURN_IF_ERROR(TRITONBACKEND_ModelConfig(\n      triton_model_, 1 /* config_version */, &config_message));\n\n  // Get the model configuration as a json string from\n  // config_message. We use TritonJson, which is a wrapper that\n  // returns nice errors (currently the underlying implementation is\n  // rapidjson... but others could be added).\n  const char* buffer;\n  size_t byte_size;\n  RETURN_IF_ERROR(\n      TRITONSERVER_MessageSerializeToJson(config_message, &buffer, &byte_size));\n\n  TRITONSERVER_Error* err = model_config_.Parse(buffer, byte_size);\n  RETURN_IF_ERROR(TRITONSERVER_MessageDelete(config_message));\n  RETURN_IF_ERROR(err);\n\n  int64_t mbs = 0;\n  RETURN_IF_ERROR(model_config_.MemberAsInt(\"max_batch_size\", &mbs));\n  max_batch_size_ = mbs;\n\n  enable_pinned_input_ = false;\n  enable_pinned_output_ = false;\n  {\n    common::TritonJson::Value optimization;\n    if (model_config_.Find(\"optimization\", &optimization)) {\n      common::TritonJson::Value pinned_memory;\n      if (optimization.Find(\"input_pinned_memory\", &pinned_memory)) {\n        RETURN_IF_ERROR(\n            pinned_memory.MemberAsBool(\"enable\", &enable_pinned_input_));\n      }\n      if (optimization.Find(\"output_pinned_memory\", &pinned_memory)) {\n        RETURN_IF_ERROR(\n            pinned_memory.MemberAsBool(\"enable\", &enable_pinned_output_));\n      }\n    }\n  }\n\n  RETURN_IF_ERROR(\n      BatchInput::ParseFromModelConfig(model_config_, &batch_inputs_));\n  RETURN_IF_ERROR(\n      BatchOutput::ParseFromModelConfig(model_config_, &batch_outputs_));\n  for (const auto& batch_output : batch_outputs_) {\n    for (const auto& name : batch_output.TargetNames()) {\n      batch_output_map_.emplace(name, &batch_output);\n    }\n  }\n  
triton::common::TritonJson::Value config_inputs;\n  RETURN_IF_ERROR(model_config_.MemberAsArray(\"input\", &config_inputs));\n  for (size_t i = 0; i < config_inputs.ArraySize(); i++) {\n    triton::common::TritonJson::Value io;\n    RETURN_IF_ERROR(config_inputs.IndexAsObject(i, &io));\n    std::string io_name;\n    RETURN_IF_ERROR(io.MemberAsString(\"name\", &io_name));\n    triton::common::TritonJson::Value input_property_json;\n    bool allow_ragged_batch = false;\n    if (io.Find(\"allow_ragged_batch\", &input_property_json)) {\n      RETURN_IF_ERROR(input_property_json.AsBool(&allow_ragged_batch));\n    }\n    if (allow_ragged_batch) {\n      ragged_inputs_.emplace(io_name);\n    }\n    bool optional = false;\n    if (io.Find(\"optional\", &input_property_json)) {\n      RETURN_IF_ERROR(input_property_json.AsBool(&optional));\n    }\n    if (optional) {\n      if (allow_optional_) {\n        optional_inputs_.emplace(io_name);\n      } else {\n        RETURN_IF_ERROR(TRITONSERVER_ErrorNew(\n            TRITONSERVER_ERROR_INVALID_ARG,\n            (std::string(\"'optional' is set to true for input '\") + io_name +\n             \"' while the backend model doesn't support optional input\")\n                .c_str()));\n      }\n    }\n  }\n\n  return nullptr;\n}\n\nTRITONSERVER_Error*\nBackendModel::SetModelConfig()\n{\n  triton::common::TritonJson::WriteBuffer json_buffer;\n  RETURN_IF_ERROR(ModelConfig().Write(&json_buffer));\n\n  TRITONSERVER_Message* message;\n  RETURN_IF_ERROR(TRITONSERVER_MessageNewFromSerializedJson(\n      &message, json_buffer.Base(), json_buffer.Size()));\n  RETURN_IF_ERROR(TRITONBACKEND_ModelSetConfig(\n      triton_model_, 1 /* config_version */, message));\n  RETURN_IF_ERROR(TRITONSERVER_MessageDelete(message));\n\n  // Triton core can normalize the missing config settings\n  // in the above call. 
We must retrieve the updated model\n  // configuration from the core.\n  RETURN_IF_ERROR(ParseModelConfig());\n\n  return nullptr;\n}\n\nTRITONSERVER_Error*\nBackendModel::SupportsFirstDimBatching(bool* supports)\n{\n  *supports = max_batch_size_ > 0;\n  return nullptr;\n}\n\nconst BatchOutput*\nBackendModel::FindBatchOutput(const std::string& output_name) const\n{\n  const auto it = batch_output_map_.find(output_name);\n  return ((it == batch_output_map_.end()) ? nullptr : it->second);\n}\n\n}}  // namespace triton::backend\n"
  },
  {
    "path": "src/backend_model_instance.cc",
    "content": "// Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.\n//\n// Redistribution and use in source and binary forms, with or without\n// modification, are permitted provided that the following conditions\n// are met:\n//  * Redistributions of source code must retain the above copyright\n//    notice, this list of conditions and the following disclaimer.\n//  * Redistributions in binary form must reproduce the above copyright\n//    notice, this list of conditions and the following disclaimer in the\n//    documentation and/or other materials provided with the distribution.\n//  * Neither the name of NVIDIA CORPORATION nor the names of its\n//    contributors may be used to endorse or promote products derived\n//    from this software without specific prior written permission.\n//\n// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n// PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n#include \"triton/backend/backend_model_instance.h\"\n\n#include <vector>\n\n#include \"triton/backend/backend_common.h\"\n#include \"triton/backend/backend_model.h\"\n\nnamespace triton { namespace backend {\n\n//\n// BackendModelInstance\n//\nBackendModelInstance::BackendModelInstance(\n    BackendModel* backend_model,\n    TRITONBACKEND_ModelInstance* triton_model_instance)\n    : backend_model_(backend_model),\n      triton_model_instance_(triton_model_instance)\n{\n  const char* instance_name;\n  THROW_IF_BACKEND_INSTANCE_ERROR(\n      TRITONBACKEND_ModelInstanceName(triton_model_instance, &instance_name));\n  name_ = instance_name;\n\n  THROW_IF_BACKEND_INSTANCE_ERROR(\n      TRITONBACKEND_ModelInstanceKind(triton_model_instance, &kind_));\n\n  THROW_IF_BACKEND_INSTANCE_ERROR(\n      TRITONBACKEND_ModelInstanceDeviceId(triton_model_instance, &device_id_));\n\n  common::TritonJson::Value& model_config = backend_model->ModelConfig();\n\n  // If the model configuration specifies a 'default_model_filename'\n  // and/or specifies 'cc_model_filenames' then determine the\n  // appropriate 'artifact_filename' value. 
If model configuration\n  // does not specify then just leave 'artifact_filename' empty and\n  // the backend can then provide its own logic for determine the\n  // filename if that is appropriate.\n  THROW_IF_BACKEND_INSTANCE_ERROR(model_config.MemberAsString(\n      \"default_model_filename\", &artifact_filename_));\n\n  switch (kind_) {\n    case TRITONSERVER_INSTANCEGROUPKIND_CPU: {\n      LOG_MESSAGE(\n          TRITONSERVER_LOG_VERBOSE,\n          (std::string(\"Creating instance \") + name_ +\n           \" on CPU using artifact '\" + artifact_filename_ + \"'\")\n              .c_str());\n      break;\n    }\n    case TRITONSERVER_INSTANCEGROUPKIND_MODEL: {\n      LOG_MESSAGE(\n          TRITONSERVER_LOG_VERBOSE,\n          (std::string(\"Creating instance \") + name_ +\n           \" on model-specified devices using artifact '\" + artifact_filename_ +\n           \"'\")\n              .c_str());\n      break;\n    }\n    case TRITONSERVER_INSTANCEGROUPKIND_GPU: {\n#if defined(TRITON_ENABLE_GPU)\n      cudaDeviceProp cuprops;\n      cudaError_t cuerr = cudaGetDeviceProperties(&cuprops, device_id_);\n      if (cuerr != cudaSuccess) {\n        throw BackendModelInstanceException(TRITONSERVER_ErrorNew(\n            TRITONSERVER_ERROR_INTERNAL,\n            (std::string(\"unable to get CUDA device properties for \") + name_ +\n             \": \" + cudaGetErrorString(cuerr))\n                .c_str()));\n      }\n\n      const std::string cc =\n          std::to_string(cuprops.major) + \".\" + std::to_string(cuprops.minor);\n      common::TritonJson::Value cc_names;\n      common::TritonJson::Value cc_name;\n      if ((model_config.Find(\"cc_model_filenames\", &cc_names)) &&\n          (cc_names.Find(cc.c_str(), &cc_name))) {\n        cc_name.AsString(&artifact_filename_);\n      }\n\n      LOG_MESSAGE(\n          TRITONSERVER_LOG_VERBOSE,\n          (std::string(\"Creating instance \") + name_ + \" on GPU \" +\n           std::to_string(device_id_) + \" (\" + 
cc + \") using artifact '\" +\n           artifact_filename_ + \"'\")\n              .c_str());\n#elif !defined(TRITON_ENABLE_MALI_GPU)\n      throw BackendModelInstanceException(TRITONSERVER_ErrorNew(\n          TRITONSERVER_ERROR_INTERNAL, \"GPU instances not supported\"));\n#endif  // TRITON_ENABLE_GPU\n      break;\n    }\n    default: {\n      throw BackendModelInstanceException(TRITONSERVER_ErrorNew(\n          TRITONSERVER_ERROR_INTERNAL,\n          (std::string(\"unexpected instance kind for \") + name_).c_str()));\n    }\n  }\n\n  stream_ = nullptr;\n  if (kind_ == TRITONSERVER_INSTANCEGROUPKIND_GPU) {\n    THROW_IF_BACKEND_INSTANCE_ERROR(\n        CreateCudaStream(device_id_, 0 /* cuda_stream_priority */, &stream_));\n  }\n\n  // Get the host policy setting as a json string from message,\n  // and extract the host policy name for the instance.\n  TRITONSERVER_Message* message = nullptr;\n  THROW_IF_BACKEND_MODEL_ERROR(\n      TRITONBACKEND_ModelInstanceHostPolicy(triton_model_instance_, &message));\n  const char* buffer;\n  size_t byte_size;\n  THROW_IF_BACKEND_MODEL_ERROR(\n      TRITONSERVER_MessageSerializeToJson(message, &buffer, &byte_size));\n\n  common::TritonJson::Value host_policy;\n  TRITONSERVER_Error* err = host_policy.Parse(buffer, byte_size);\n  THROW_IF_BACKEND_MODEL_ERROR(err);\n  std::vector<std::string> host_policy_name;\n  THROW_IF_BACKEND_MODEL_ERROR(host_policy.Members(&host_policy_name));\n  if (host_policy_name.size() != 1) {\n    throw BackendModelInstanceException(TRITONSERVER_ErrorNew(\n        TRITONSERVER_ERROR_INTERNAL,\n        (std::string(\"unexpected no host policy for \") + name_).c_str()));\n  }\n  host_policy_name_ = host_policy_name[0];\n}\n\n\nBackendModelInstance::~BackendModelInstance()\n{\n#ifdef TRITON_ENABLE_GPU\n  if (stream_ != nullptr) {\n    cudaError_t err = cudaStreamDestroy(stream_);\n    if (err != cudaSuccess) {\n      TRITONSERVER_LogMessage(\n          TRITONSERVER_LOG_ERROR, __FILE__, __LINE__,\n      
    (std::string(\"~BackendModelInstance: \") + name_ +\n           \" failed to destroy cuda stream: \" + cudaGetErrorString(err))\n              .c_str());\n    }\n    stream_ = nullptr;\n  }\n#endif  // TRITON_ENABLE_GPU\n}\n\n}}  // namespace triton::backend\n"
  },
  {
    "path": "src/backend_output_responder.cc",
    "content": "// Copyright 2019-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n//\n// Redistribution and use in source and binary forms, with or without\n// modification, are permitted provided that the following conditions\n// are met:\n//  * Redistributions of source code must retain the above copyright\n//    notice, this list of conditions and the following disclaimer.\n//  * Redistributions in binary form must reproduce the above copyright\n//    notice, this list of conditions and the following disclaimer in the\n//    documentation and/or other materials provided with the distribution.\n//  * Neither the name of NVIDIA CORPORATION nor the names of its\n//    contributors may be used to endorse or promote products derived\n//    from this software without specific prior written permission.\n//\n// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n// PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n#include \"triton/backend/backend_output_responder.h\"\n\n#include \"triton/backend/backend_common.h\"\n#include \"triton/backend/backend_model.h\"\n#include \"triton/backend/backend_model_instance.h\"\n\nnamespace triton { namespace backend {\n\n//\n// BackendOutputResponder\n//\nBackendOutputResponder::~BackendOutputResponder()\n{\n  for (auto& pinned_memory : pinned_memories_) {\n    LOG_IF_ERROR(\n        TRITONBACKEND_MemoryManagerFree(\n            memory_manager_, reinterpret_cast<void*>(pinned_memory),\n            TRITONSERVER_MEMORY_CPU_PINNED, 0),\n        \"failed to free pinned memory\");\n  }\n}\n\nvoid\nBackendOutputResponder::ProcessTensor(\n    const std::string& output_name, const TRITONSERVER_DataType datatype,\n    std::vector<int64_t>& batchn_shape, const char* buffer,\n    const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id)\n{\n  // A value of CPU_PINNED indicates that pinned memory buffer is not\n  // needed for this tensor. 
Any other value indicates that a pinned\n  // memory buffer is needed when the target memory type matches\n  // 'use_pinned_memory_type'.\n  TRITONSERVER_MemoryType use_pinned_memory_type =\n      TRITONSERVER_MEMORY_CPU_PINNED;\n  if (pinned_enabled_) {\n    use_pinned_memory_type = GetUsePinnedMemoryType(memory_type);\n  }\n\n  const int64_t batchn_batch_size = batchn_shape[0];\n  int64_t batch_size_offset = 0;\n\n  size_t tensor_offset = 0;\n\n  for (size_t idx = 0; idx < responses_->size(); idx++) {\n    auto& request = requests_[idx];\n    auto& response = (*responses_)[idx];\n\n    // If then pending copies are from tensor buffer that is not\n    // contiguous with 'response's part of that buffer, then need to\n    // go ahead and perform the pending copies so that can start a\n    // new contiguous region if necessary.\n    if ((pending_pinned_byte_size_ > 0) &&\n        (tensor_offset !=\n         (pending_pinned_byte_size_ + pending_pinned_offset_))) {\n      need_sync_ |= FlushPendingPinned(buffer, memory_type, memory_type_id);\n    }\n\n    // Override shape to be correct for this response.\n    if (first_dim_batching_) {\n      TRITONBACKEND_Input* input;\n      TRITONBACKEND_RequestInputByIndex(request, 0, &input);\n      const int64_t* shape;\n      TRITONBACKEND_InputProperties(\n          input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr);\n      if ((batchn_batch_size != -1) &&\n          ((batch_size_offset + shape[0]) > batchn_batch_size)) {\n        if (response != nullptr) {\n          RESPOND_AND_SET_NULL_IF_ERROR(\n              &response,\n              TRITONSERVER_ErrorNew(\n                  TRITONSERVER_ERROR_UNSUPPORTED,\n                  std::string(\n                      GetRequestId(request) +\n                      \"failed to split the output tensor '\" + output_name +\n                      \"' in responses: expected batch size of at least \" +\n                      std::to_string(batch_size_offset + shape[0]) +\n      
                \" in model output, got \" +\n                      std::to_string(batchn_batch_size))\n                      .c_str()));\n        }\n      }\n      batchn_shape[0] = shape[0];\n      batch_size_offset += shape[0];\n    }\n\n    int64_t tensor_byte_size = 0;\n    RESPOND_AND_SET_NULL_IF_ERROR(\n        &response, GetByteSize(datatype, batchn_shape, &tensor_byte_size));\n\n    TRITONBACKEND_Output* response_output;\n    if (response != nullptr) {\n      uint32_t output_count;\n      RESPOND_AND_SET_NULL_IF_ERROR(\n          &response, TRITONBACKEND_RequestOutputCount(request, &output_count));\n      if (response != nullptr) {\n        for (uint32_t output_idx = 0; output_idx < output_count; output_idx++) {\n          const char* name;\n          RESPOND_AND_SET_NULL_IF_ERROR(\n              &response,\n              TRITONBACKEND_RequestOutputName(request, output_idx, &name));\n          if ((response != nullptr) && (output_name == name)) {\n            RESPOND_AND_SET_NULL_IF_ERROR(\n                &response, TRITONBACKEND_ResponseOutput(\n                               response, &response_output, name, datatype,\n                               batchn_shape.data(), batchn_shape.size()));\n            if (response != nullptr) {\n              need_sync_ |= SetFixedSizeBuffer(\n                  &response, response_output, output_name, tensor_byte_size,\n                  tensor_offset, buffer, memory_type, memory_type_id,\n                  use_pinned_memory_type, false /* state */);\n            }\n\n            break;\n          }\n        }\n      }\n    }\n\n    tensor_offset += tensor_byte_size;\n  }\n\n  // Done with the tensor, flush any pending pinned copies.\n  need_sync_ |= FlushPendingPinned(buffer, memory_type, memory_type_id);\n#ifdef TRITON_ENABLE_GPU\n  if (need_sync_ && (event_ != nullptr)) {\n    cudaEventRecord(event_, stream_);\n  }\n#endif  // 
TRITON_ENABLE_GPU\n}\n\nstd::vector<TRITONBACKEND_State*>\nBackendOutputResponder::ProcessStateTensor(\n    const std::string& output_state_name, const TRITONSERVER_DataType datatype,\n    std::vector<int64_t>& batchn_shape, const char* buffer,\n    const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id)\n{\n  // A value of CPU_PINNED indicates that pinned memory buffer is not\n  // needed for this tensor. Any other value indicates that a pinned\n  // memory buffer is needed when the target memory type matches\n  // 'use_pinned_memory_type'.\n  TRITONSERVER_MemoryType use_pinned_memory_type =\n      TRITONSERVER_MEMORY_CPU_PINNED;\n  if (pinned_enabled_) {\n    use_pinned_memory_type = GetUsePinnedMemoryType(memory_type);\n  }\n\n  std::vector<TRITONBACKEND_State*> states;\n\n  const int64_t batchn_batch_size = batchn_shape[0];\n  int64_t batch_size_offset = 0;\n\n  size_t tensor_offset = 0;\n\n  for (size_t idx = 0; idx < responses_->size(); idx++) {\n    auto& request = requests_[idx];\n    auto& response = (*responses_)[idx];\n\n    // If then pending copies are from tensor buffer that is not\n    // contiguous with 'response's part of that buffer, then need to\n    // go ahead and perform the pending copies so that can start a\n    // new contiguous region if necessary.\n    if ((pending_pinned_byte_size_ > 0) &&\n        (tensor_offset !=\n         (pending_pinned_byte_size_ + pending_pinned_offset_))) {\n      need_sync_ |= FlushPendingPinned(buffer, memory_type, memory_type_id);\n    }\n\n    // Override shape to be correct for this response.\n    if (first_dim_batching_) {\n      TRITONBACKEND_Input* input;\n      TRITONBACKEND_RequestInputByIndex(request, 0, &input);\n      const int64_t* shape;\n      TRITONBACKEND_InputProperties(\n          input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr);\n      if ((batchn_batch_size != -1) &&\n          ((batch_size_offset + shape[0]) > batchn_batch_size)) {\n        if (response != 
nullptr) {\n          RESPOND_AND_SET_NULL_IF_ERROR(\n              &response,\n              TRITONSERVER_ErrorNew(\n                  TRITONSERVER_ERROR_UNSUPPORTED,\n                  std::string(\n                      GetRequestId(request) +\n                      \"failed to split the output state tensor '\" +\n                      output_state_name +\n                      \"' in responses: expected batch size of at least \" +\n                      std::to_string(batch_size_offset + shape[0]) +\n                      \" in model output, got \" +\n                      std::to_string(batchn_batch_size))\n                      .c_str()));\n        }\n      }\n      batchn_shape[0] = shape[0];\n      batch_size_offset += shape[0];\n    }\n\n    int64_t tensor_byte_size = 0;\n    RESPOND_AND_SET_NULL_IF_ERROR(\n        &response, GetByteSize(datatype, batchn_shape, &tensor_byte_size));\n\n    TRITONBACKEND_State* output_state;\n    if (response != nullptr) {\n      RESPOND_AND_SET_NULL_IF_ERROR(\n          &response, TRITONBACKEND_StateNew(\n                         &output_state, request, output_state_name.c_str(),\n                         datatype, batchn_shape.data(), batchn_shape.size()));\n      if (response != nullptr) {\n        states.push_back(output_state);\n        need_sync_ |= SetFixedSizeBuffer(\n            &response, output_state, output_state_name, tensor_byte_size,\n            tensor_offset, buffer, memory_type, memory_type_id,\n            use_pinned_memory_type, true /* state */);\n      }\n    }\n\n    tensor_offset += tensor_byte_size;\n  }\n\n  // Done with the tensor, flush any pending pinned copies.\n  need_sync_ |= FlushPendingPinned(buffer, memory_type, memory_type_id);\n#ifdef TRITON_ENABLE_GPU\n  if (need_sync_ && (event_ != nullptr)) {\n    cudaEventRecord(event_, stream_);\n  }\n#endif  // TRITON_ENABLE_GPU\n\n  return states;\n}\n\nbool\nBackendOutputResponder::Finalize()\n{\n#ifdef TRITON_ENABLE_GPU\n  if 
((!deferred_pinned_.empty()) && need_sync_) {\n    if (event_ != nullptr) {\n      cudaEventSynchronize(event_);\n    } else {\n      cudaStreamSynchronize(stream_);\n    }\n    need_sync_ = false;\n  }\n#endif  // TRITON_ENABLE_GPU\n\n  // After the above sync all the GPU->pinned copies are complete. Any\n  // deferred copies of pinned->CPU can now be done.\n  for (auto& def : deferred_pinned_) {\n    auto pinned_memory_type = TRITONSERVER_MEMORY_CPU_PINNED;\n    int64_t pinned_memory_id = 0;\n    char* pinned_buffer = def.pinned_memory_;\n\n    size_t offset = 0;\n    for (auto& pr : def.responses_) {\n      auto& response = pr.first;\n      auto& response_output = pr.second;\n\n      bool cuda_used = false;\n      RESPOND_AND_SET_NULL_IF_ERROR(\n          response,\n          CopyBuffer(\n              response_output.name_, pinned_memory_type, pinned_memory_id,\n              response_output.memory_type_, response_output.memory_type_id_,\n              response_output.buffer_byte_size_, pinned_buffer + offset,\n              const_cast<void*>(response_output.buffer_), stream_, &cuda_used,\n              copy_on_stream_));\n      need_sync_ |= cuda_used;\n\n      offset += response_output.buffer_byte_size_;\n    }\n  }\n\n#ifdef TRITON_ENABLE_GPU\n  // Record the new event location if deferred copies occur\n  if ((!deferred_pinned_.empty()) && need_sync_ && (event_ != nullptr)) {\n    cudaEventRecord(event_, stream_);\n  }\n#endif  // TRITON_ENABLE_GPU\n  deferred_pinned_.clear();\n\n  return need_sync_;\n}\n\n\nbool\nBackendOutputResponder::SetFixedSizeBuffer(\n    TRITONBACKEND_Response** response, void* response_output_or_state,\n    const std::string& output_name, const size_t tensor_byte_size,\n    const size_t tensor_offset, const char* tensor_buffer,\n    const TRITONSERVER_MemoryType tensor_memory_type,\n    const int64_t tensor_memory_type_id,\n    const TRITONSERVER_MemoryType use_pinned_memory_type, bool state)\n{\n  void* buffer = nullptr;\n  bool 
cuda_copy = false;\n\n  TRITONSERVER_MemoryType actual_memory_type = tensor_memory_type;\n  int64_t actual_memory_type_id = tensor_memory_type_id;\n\n  if (state) {\n    TRITONBACKEND_State* response_state =\n        reinterpret_cast<TRITONBACKEND_State*>(response_output_or_state);\n    auto err = TRITONBACKEND_StateBuffer(\n        response_state, &buffer, tensor_byte_size, &actual_memory_type,\n        &actual_memory_type_id);\n    if (err != nullptr) {\n      RESPOND_AND_SET_NULL_IF_ERROR(response, err);\n      return cuda_copy;\n    }\n  } else {\n    TRITONBACKEND_Output* response_output =\n        reinterpret_cast<TRITONBACKEND_Output*>(response_output_or_state);\n    auto err = TRITONBACKEND_OutputBuffer(\n        response_output, &buffer, tensor_byte_size, &actual_memory_type,\n        &actual_memory_type_id);\n    if (err != nullptr) {\n      RESPOND_AND_SET_NULL_IF_ERROR(response, err);\n      return cuda_copy;\n    }\n  }\n\n  // If the response buffer matches the memory type that should use an\n  // intermediate pinned memory buffer for the transfer, then just\n  // record the response as pending and increase the size required for\n  // the intermediate pinned buffer.\n  if ((use_pinned_memory_type != TRITONSERVER_MEMORY_CPU_PINNED) &&\n      (actual_memory_type == use_pinned_memory_type)) {\n    if (pending_pinned_byte_size_ == 0) {\n      pending_pinned_offset_ = tensor_offset;\n    }\n\n    pending_pinned_byte_size_ += tensor_byte_size;\n    pending_pinned_outputs_.push_back(std::make_pair(\n        response, OutputData(\n                      output_name, buffer, tensor_byte_size, actual_memory_type,\n                      actual_memory_type_id)));\n  } else {\n    // Direct copy without intermediate pinned memory.\n    bool cuda_used = false;\n    auto err = CopyBuffer(\n        output_name, tensor_memory_type, tensor_memory_type_id,\n        actual_memory_type, actual_memory_type_id, tensor_byte_size,\n        tensor_buffer + tensor_offset, 
buffer, stream_, &cuda_used,\n        copy_on_stream_);\n    cuda_copy |= cuda_used;\n\n    if (err != nullptr) {\n      RESPOND_AND_SET_NULL_IF_ERROR(response, err);\n      return cuda_copy;\n    }\n  }\n\n  return cuda_copy;\n}\n\nbool\nBackendOutputResponder::FlushPendingPinned(\n    const char* tensor_buffer, const TRITONSERVER_MemoryType tensor_memory_type,\n    const int64_t tensor_memory_type_id)\n{\n  bool cuda_copy = false;\n\n  // Will be copying from CPU->pinned->GPU or GPU->pinned->CPU\n\n  // Attempt to allocate a pinned buffer to use for staging the\n  // copy... if we fail to allocated the pinned buffer then we just\n  // directly go CPU->GPU or GPU->CPU.\n  char* pinned_memory = nullptr;\n  if (pending_pinned_byte_size_ > 0) {\n    TRITONSERVER_Error* err = TRITONBACKEND_MemoryManagerAllocate(\n        memory_manager_, reinterpret_cast<void**>(&pinned_memory),\n        TRITONSERVER_MEMORY_CPU_PINNED, 0 /* memory_type_id */,\n        pending_pinned_byte_size_);\n    if (err != nullptr) {\n      pinned_memory = nullptr;\n      TRITONSERVER_ErrorDelete(err);\n    }\n  }\n\n  // If the pinned buffer wasn't actually allocated then just perform\n  // a direct copy.\n  if (pinned_memory == nullptr) {\n    size_t offset = 0;\n    for (auto& pr : pending_pinned_outputs_) {\n      auto& response = pr.first;\n      auto& response_output = pr.second;\n\n      bool cuda_used = false;\n      RESPOND_AND_SET_NULL_IF_ERROR(\n          response,\n          CopyBuffer(\n              response_output.name_, tensor_memory_type, tensor_memory_type_id,\n              response_output.memory_type_, response_output.memory_type_id_,\n              response_output.buffer_byte_size_,\n              tensor_buffer + pending_pinned_offset_ + offset,\n              const_cast<void*>(response_output.buffer_), stream_, &cuda_used,\n              copy_on_stream_));\n      cuda_copy |= cuda_used;\n\n      offset += response_output.buffer_byte_size_;\n    }\n  }\n  // We have a pinned 
buffer so do a single copy of a block of tensor\n  // data to the pinned buffer.\n  else {  // pinned_memory_type == TRITONSERVER_MEMORY_CPU_PINNED\n    bool cuda_used = false;\n    auto err = CopyBuffer(\n        \"pinned buffer\", tensor_memory_type, tensor_memory_type_id,\n        TRITONSERVER_MEMORY_CPU_PINNED, 0 /* memory_type_id */,\n        pending_pinned_byte_size_, tensor_buffer + pending_pinned_offset_,\n        pinned_memory, stream_, &cuda_used, copy_on_stream_);\n    cuda_copy |= cuda_used;\n\n    // If something goes wrong with the copy all the pending\n    // responses fail...\n    if (err != nullptr) {\n      for (auto& pr : pending_pinned_outputs_) {\n        auto& response = pr.first;\n        if (*response != nullptr) {\n          LOG_IF_ERROR(\n              TRITONBACKEND_ResponseSend(\n                  *response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, err),\n              \"failed to send TensorFlow error response\");\n          *response = nullptr;\n        }\n      }\n      TRITONSERVER_ErrorDelete(err);\n    }\n\n    // If the copy was not async (i.e. 
if tensor was in CPU so a\n    // CPU->CPU-PINNED copy was performed above), then the pinned\n    // buffer now holds the tensor contents and we can immediately\n    // issue the copies from the pinned buffer to the\n    // responses.\n    //\n    // Otherwise the GPU->CPU-PINNED async copies are in flight and we\n    // simply remember the pinned buffer and the corresponding\n    // response outputs so that we can do the pinned->CPU copies in\n    // finalize after we have waited for all async copies to complete.\n    if (!cuda_used) {\n      size_t offset = 0;\n      for (auto& pr : pending_pinned_outputs_) {\n        auto& response = pr.first;\n        auto& response_output = pr.second;\n\n        bool cuda_used = false;\n        RESPOND_AND_SET_NULL_IF_ERROR(\n            response,\n            CopyBuffer(\n                response_output.name_, TRITONSERVER_MEMORY_CPU_PINNED,\n                0 /* memory_type_id */, response_output.memory_type_,\n                response_output.memory_type_id_,\n                response_output.buffer_byte_size_, pinned_memory + offset,\n                const_cast<void*>(response_output.buffer_), stream_, &cuda_used,\n                copy_on_stream_));\n        cuda_copy |= cuda_used;\n\n        offset += response_output.buffer_byte_size_;\n      }\n    } else {\n      deferred_pinned_.emplace_back(\n          pinned_memory, pending_pinned_byte_size_,\n          std::move(pending_pinned_outputs_));\n    }\n  }\n\n  // Pending pinned copies are handled...\n  pending_pinned_byte_size_ = 0;\n  pending_pinned_offset_ = 0;\n  pending_pinned_outputs_.clear();\n\n  // Need to hold on to the allocated pinned buffer as there are still\n  // copies in flight. 
Will delete it in finalize.\n  if (pinned_memory != nullptr) {\n    pinned_memories_.push_back(pinned_memory);\n  }\n\n  return cuda_copy;\n}\n\nvoid\nBackendOutputResponder::ProcessBatchOutput(\n    const std::string& name, const BatchOutput& batch_output,\n    const char* buffer, const TRITONSERVER_MemoryType memory_type,\n    const int64_t memory_type_id)\n{\n  // A value of CPU_PINNED indicates that pinned memory buffer is not\n  // needed for this tensor. Any other value indicates that a pinned\n  // memory buffer is needed when the target memory type matches\n  // 'use_pinned_memory_type'.\n  TRITONSERVER_MemoryType use_pinned_memory_type =\n      TRITONSERVER_MEMORY_CPU_PINNED;\n  if (pinned_enabled_) {\n    use_pinned_memory_type = GetUsePinnedMemoryType(memory_type);\n  }\n\n  // Batch output may be processed differently based on the kind\n  switch (batch_output.BatchOutputKind()) {\n    case BatchOutput::Kind::BATCH_SCATTER_WITH_INPUT_SHAPE: {\n      const auto& output_name = batch_output.TargetNames()[0];\n      const auto& input_name = batch_output.SourceInputs()[0];\n      const auto& datatype = batch_output.DataType();\n      size_t tensor_offset = 0;\n\n      for (size_t idx = 0; idx < responses_->size(); idx++) {\n        auto& request = requests_[idx];\n        auto& response = (*responses_)[idx];\n\n        // If then pending copies are from tensor buffer that is not\n        // contiguous with 'response's part of that buffer, then need to\n        // go ahead and perform the pending copies so that can start a\n        // new contiguous region if necessary.\n        if ((pending_pinned_byte_size_ > 0) &&\n            (tensor_offset !=\n             (pending_pinned_byte_size_ + pending_pinned_offset_))) {\n          need_sync_ |= FlushPendingPinned(buffer, memory_type, memory_type_id);\n        }\n\n        // Override shape to be correct for this response, with a naive\n        // assumption that the dynamic dimension in output is mapped to the 
same\n        // dimension in the input\n        auto output_batchn_shape = batch_output.OutputShape();\n        {\n          TRITONBACKEND_Input* input;\n          TRITONBACKEND_RequestInput(request, input_name.c_str(), &input);\n          const int64_t* shape;\n          TRITONBACKEND_InputProperties(\n              input, nullptr, nullptr, &shape, nullptr, nullptr, nullptr);\n          for (size_t dim_idx = 0; dim_idx < output_batchn_shape.size();\n               dim_idx++) {\n            if (output_batchn_shape[dim_idx] == -1) {\n              output_batchn_shape[dim_idx] = shape[dim_idx];\n            }\n          }\n        }\n\n        int64_t tensor_byte_size = 0;\n        RESPOND_AND_SET_NULL_IF_ERROR(\n            &response,\n            GetByteSize(datatype, output_batchn_shape, &tensor_byte_size));\n\n        TRITONBACKEND_Output* response_output;\n        if (response != nullptr) {\n          uint32_t output_count;\n          RESPOND_AND_SET_NULL_IF_ERROR(\n              &response,\n              TRITONBACKEND_RequestOutputCount(request, &output_count));\n          if (response != nullptr) {\n            for (uint32_t output_idx = 0; output_idx < output_count;\n                 output_idx++) {\n              const char* name;\n              RESPOND_AND_SET_NULL_IF_ERROR(\n                  &response,\n                  TRITONBACKEND_RequestOutputName(request, output_idx, &name));\n              if ((response != nullptr) && (output_name == name)) {\n                RESPOND_AND_SET_NULL_IF_ERROR(\n                    &response, TRITONBACKEND_ResponseOutput(\n                                   response, &response_output, name, datatype,\n                                   output_batchn_shape.data(),\n                                   output_batchn_shape.size()));\n                if (response != nullptr) {\n                  need_sync_ |= SetFixedSizeBuffer(\n                      &response, response_output, output_name, tensor_byte_size,\n               
       tensor_offset, buffer, memory_type, memory_type_id,\n                      use_pinned_memory_type, false /* state */);\n                }\n\n                break;\n              }\n            }\n          }\n        }\n\n        tensor_offset += tensor_byte_size;\n      }\n      break;\n    }\n  }\n\n  // Done with the tensor, flush any pending pinned copies.\n  need_sync_ |= FlushPendingPinned(buffer, memory_type, memory_type_id);\n#ifdef TRITON_ENABLE_GPU\n  if (need_sync_ && (event_ != nullptr)) {\n    cudaEventRecord(event_, stream_);\n  }\n#endif  // TRITON_ENABLE_GPU\n}\n\n}}  // namespace triton::backend\n"
  },
  {
    "path": "src/device_memory_tracker.cc",
    "content": "// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n//\n// Redistribution and use in source and binary forms, with or without\n// modification, are permitted provided that the following conditions\n// are met:\n//  * Redistributions of source code must retain the above copyright\n//    notice, this list of conditions and the following disclaimer.\n//  * Redistributions in binary form must reproduce the above copyright\n//    notice, this list of conditions and the following disclaimer in the\n//    documentation and/or other materials provided with the distribution.\n//  * Neither the name of NVIDIA CORPORATION nor the names of its\n//    contributors may be used to endorse or promote products derived\n//    from this software without specific prior written permission.\n//\n// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n// PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n#include \"triton/backend/device_memory_tracker.h\"\n\n#include <iostream>\n#include <stdexcept>\n\n#include \"triton/core/tritonserver.h\"\n\nnamespace triton { namespace backend {\n\nstd::unique_ptr<DeviceMemoryTracker> DeviceMemoryTracker::tracker_{nullptr};\n// Boilerplate from CUPTI examples\nnamespace {\n\n#define LOG_IF_CUPTI_ERR(call)                                \\\n  do {                                                        \\\n    CUptiResult _status = call;                               \\\n    if (_status != CUPTI_SUCCESS) {                           \\\n      const char* errstr;                                     \\\n      cuptiGetResultString(_status, &errstr);                 \\\n      LOG_ERROR << #call << \" failed with error: \" << errstr; \\\n    }                                                         \\\n  } while (0)\n\n#define THROW_IF_CUPTI_ERR(call)                                 \\\n  do {                                                           \\\n    CUptiResult _status = call;                                  \\\n    if (_status != CUPTI_SUCCESS) {                              \\\n      const char* errstr;                                        \\\n      cuptiGetResultString(_status, &errstr);                    \\\n      throw std::runtime_error(                                  \\\n          std::string(#call) + \" failed with error: \" + errstr); \\\n    }                                   
                         \\\n  } while (0)\n\n#define BUF_SIZE (32 * 1024)\n#define ALIGN_SIZE (8)\n#define ALIGN_BUFFER(buffer, align)                                 \\\n  (((uintptr_t)(buffer) & ((align)-1))                              \\\n       ? ((buffer) + (align) - ((uintptr_t)(buffer) & ((align)-1))) \\\n       : (buffer))\n\nvoid\nbufferRequested(uint8_t** buffer, size_t* size, size_t* maxNumRecords)\n{\n  uint8_t* bfr = (uint8_t*)malloc(BUF_SIZE + ALIGN_SIZE);\n  if (bfr != nullptr) {\n    *size = BUF_SIZE;\n    *buffer = ALIGN_BUFFER(bfr, ALIGN_SIZE);\n    *maxNumRecords = 0;\n  } else {\n    LOG_ERROR << \"Failed to allocate buffer for CUPTI: out of memory\";\n  }\n}\n\nvoid\nbufferCompleted(\n    CUcontext ctx, uint32_t streamId, uint8_t* buffer, size_t size,\n    size_t validSize)\n{\n  CUptiResult status;\n  CUpti_Activity* record = nullptr;\n\n  if (validSize > 0) {\n    do {\n      status = cuptiActivityGetNextRecord(buffer, validSize, &record);\n      if (status == CUPTI_SUCCESS) {\n        DeviceMemoryTracker::TrackActivity(record);\n      } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED)\n        break;\n      else {\n        LOG_IF_CUPTI_ERR(status);\n      }\n    } while (1);\n\n    // report any records dropped from the queue\n    size_t dropped = 0;\n    LOG_IF_CUPTI_ERR(\n        cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));\n    if (dropped != 0) {\n      LOG_WARNING << \"Dropped \" << dropped << \" activity records\";\n    }\n  }\n\n  free(buffer);\n}\n\n}  // namespace\n\nDeviceMemoryTracker::DeviceMemoryTracker()\n{\n  cudaError_t cuerr = cudaGetDeviceCount(&device_cnt_);\n  if ((cuerr == cudaErrorNoDevice) || (cuerr == cudaErrorInsufficientDriver)) {\n    device_cnt_ = 0;\n  } else if (cuerr != cudaSuccess) {\n    throw std::runtime_error(\n        \"Unexpected failure on getting CUDA device count.\");\n  }\n\n  // Use 'cuptiSubscribe' to check if the cupti has been initialized\n  // elsewhere. 
Due to cupti limitation, there can only be one cupti client\n  // within the process, so in the case of per-backend memory tracking, we\n  // have to make the assumption that the other cupti client is using the same\n  // memory tracker implementation so that the backend may use the cupti\n  // configuration that is external to the backend without issue.\n  auto cupti_res = cuptiSubscribe(&subscriber_, nullptr, nullptr);\n  switch (cupti_res) {\n    case CUPTI_SUCCESS: {\n      THROW_IF_CUPTI_ERR(\n          cuptiActivityRegisterCallbacks(bufferRequested, bufferCompleted));\n      THROW_IF_CUPTI_ERR(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));\n      THROW_IF_CUPTI_ERR(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMORY2));\n      THROW_IF_CUPTI_ERR(\n          cuptiActivityEnable(CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION));\n      break;\n    }\n    case CUPTI_ERROR_MULTIPLE_SUBSCRIBERS_NOT_SUPPORTED: {\n      LOG_WARNING << \"CUPTI has been initialized elsewhere, assuming the \"\n                     \"implementation is the same\";\n      break;\n    }\n    default: {\n      // other error, should propagate and disable memory tracking for the\n      // backend\n      const char* errstr;\n      cuptiGetResultString(cupti_res, &errstr);\n      throw std::runtime_error(\n          std::string(\"Unexpected failure on configuring CUPTI: \") + errstr);\n    }\n  }\n}\n\nDeviceMemoryTracker::~DeviceMemoryTracker()\n{\n  if (subscriber_) {\n    cuptiUnsubscribe(subscriber_);\n  }\n}\n\nint\nDeviceMemoryTracker::CudaDeviceCount()\n{\n  if (tracker_) {\n    return tracker_->device_cnt_;\n  }\n  throw std::runtime_error(\n      \"DeviceMemoryTracker::Init() must be called before using any \"\n      \"DeviceMemoryTracker features.\");\n}\n\nbool\nDeviceMemoryTracker::Init()\n{\n  if (tracker_ == nullptr) {\n    try {\n      tracker_.reset(new DeviceMemoryTracker());\n    }\n    catch (const std::runtime_error& ex) {\n      // Fail initialization\n      LOG_ERROR << 
ex.what();\n      return false;\n    }\n  }\n  return true;\n}\n\nvoid\nDeviceMemoryTracker::Fini()\n{\n  tracker_.reset();\n}\n\nvoid\nDeviceMemoryTracker::TrackThreadMemoryUsage(MemoryUsage* usage)\n{\n  if (!usage) {\n    return;\n  }\n  if (tracker_) {\n    THROW_IF_CUPTI_ERR(cuptiActivityPushExternalCorrelationId(\n        CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN,\n        reinterpret_cast<uint64_t>(&usage->cupti_tracker_)));\n    usage->tracked_ = true;\n  } else {\n    throw std::runtime_error(\n        \"DeviceMemoryTracker::Init() must be called before using any \"\n        \"DeviceMemoryTracker features.\");\n  }\n}\n\nvoid\nDeviceMemoryTracker::UntrackThreadMemoryUsage(MemoryUsage* usage)\n{\n  if (!usage) {\n    return;\n  }\n  if (tracker_) {\n    THROW_IF_CUPTI_ERR(cuptiActivityFlushAll(0));\n    uint64_t id = 0;\n    THROW_IF_CUPTI_ERR(cuptiActivityPopExternalCorrelationId(\n        CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN, &id));\n    usage->tracked_ = false;\n  } else {\n    throw std::runtime_error(\n        \"DeviceMemoryTracker::Init() must be called before using any \"\n        \"DeviceMemoryTracker features.\");\n  }\n}\n\nvoid\nDeviceMemoryTracker::TrackActivityInternal(CUpti_Activity* record)\n{\n  switch (record->kind) {\n    case CUPTI_ACTIVITY_KIND_MEMORY2: {\n      CUpti_ActivityMemory3* memory_record = (CUpti_ActivityMemory3*)record;\n      TRITONBACKEND_CuptiTracker* usage = nullptr;\n      {\n        std::lock_guard<std::mutex> lk(mtx_);\n        auto it = activity_to_memory_usage_.find(memory_record->correlationId);\n        if (it != activity_to_memory_usage_.end()) {\n          usage = reinterpret_cast<TRITONBACKEND_CuptiTracker*>(it->second);\n          activity_to_memory_usage_.erase(it);\n        }\n      }\n      const bool is_allocation =\n          (memory_record->memoryOperationType ==\n           CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_ALLOCATION);\n      const bool is_release =\n          (memory_record->memoryOperationType 
==\n           CUPTI_ACTIVITY_MEMORY_OPERATION_TYPE_RELEASE);\n      // Ignore memory record that is not associated with a\n      // TRITONBACKEND_CuptiTracker object or not related to allocations\n      if ((usage == nullptr) || (!usage->valid_) ||\n          (!is_allocation && !is_release)) {\n        break;\n      }\n\n      switch (memory_record->memoryKind) {\n        case CUPTI_ACTIVITY_MEMORY_KIND_DEVICE: {\n          usage->valid_ = UpdateMemoryTypeUsage(\n              memory_record, is_allocation, usage->cuda_memory_usage_byte_,\n              usage->cuda_array_len_);\n          break;\n        }\n        case CUPTI_ACTIVITY_MEMORY_KIND_PINNED: {\n          usage->valid_ = UpdateMemoryTypeUsage(\n              memory_record, is_allocation, usage->pinned_memory_usage_byte_,\n              usage->pinned_array_len_);\n          break;\n        }\n        case CUPTI_ACTIVITY_MEMORY_KIND_PAGEABLE: {\n          usage->valid_ = UpdateMemoryTypeUsage(\n              memory_record, is_allocation, usage->system_memory_usage_byte_,\n              usage->system_array_len_);\n          break;\n        }\n        default:\n          LOG_WARNING << \"Unrecognized type of memory is allocated, kind \"\n                      << memory_record->memoryKind;\n          usage->valid_ = false;\n          break;\n      }\n      break;\n    }\n    case CUPTI_ACTIVITY_KIND_EXTERNAL_CORRELATION: {\n      CUpti_ActivityExternalCorrelation* corr =\n          (CUpti_ActivityExternalCorrelation*)record;\n      if (CUPTI_EXTERNAL_CORRELATION_KIND_UNKNOWN == corr->externalKind) {\n        std::lock_guard<std::mutex> lk(mtx_);\n        activity_to_memory_usage_[corr->correlationId] =\n            static_cast<uintptr_t>(corr->externalId);\n      }\n      break;\n    }\n    case CUPTI_ACTIVITY_KIND_RUNTIME: {\n      // DO NOTHING, runtime API will be captured and reported to properly\n      // initialize records for CUPTI_ACTIVITY_KIND_MEMORY2.\n      break;\n    }\n    default:\n      
LOG_ERROR << \"Unexpected capture of cupti record, kind: \" << record->kind;\n      break;\n  }\n}\n\ninline bool\nDeviceMemoryTracker::UpdateMemoryTypeUsage(\n    CUpti_ActivityMemory3* memory_record, const bool is_allocation,\n    int64_t* memory_usage, uint32_t usage_len)\n{\n  if (memory_record->deviceId >= usage_len) {\n    return false;\n  }\n  if (is_allocation) {\n    memory_usage[memory_record->deviceId] += memory_record->bytes;\n  } else {\n    memory_usage[memory_record->deviceId] -= memory_record->bytes;\n  }\n  return true;\n}\n\n}}  // namespace triton::backend\n"
  },
  {
    "path": "src/kernel.cu",
    "content": "// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.\n//\n// Redistribution and use in source and binary forms, with or without\n// modification, are permitted provided that the following conditions\n// are met:\n//  * Redistributions of source code must retain the above copyright\n//    notice, this list of conditions and the following disclaimer.\n//  * Redistributions in binary form must reproduce the above copyright\n//    notice, this list of conditions and the following disclaimer in the\n//    documentation and/or other materials provided with the distribution.\n//  * Neither the name of NVIDIA CORPORATION nor the names of its\n//    contributors may be used to endorse or promote products derived\n//    from this software without specific prior written permission.\n//\n// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n// PURPOSE ARE DISCLAIMED.  
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n#include <cuda.h>\n\n#include \"kernel.h\"\n\n#define THREADBLOCK_SIZE 512\n__launch_bounds__(THREADBLOCK_SIZE) __global__ void TritonGatherKernel(\n    const int8_t** __restrict input_ptr_buffer,\n    const size_t* __restrict byte_size_buffer,\n    const size_t* __restrict byte_size_offset_buffer,\n    int8_t* __restrict output_buffer)\n{\n  int request_idx = blockIdx.x;\n  int lane_id = threadIdx.x;\n  const int8_t* request_input_buffer = input_ptr_buffer[request_idx];\n  int byte_size = byte_size_buffer[request_idx];\n  int byte_size_offset = byte_size_offset_buffer[request_idx];\n\n  int8_t* output_buffer_with_offset = output_buffer + byte_size_offset;\n  if (((byte_size % 4) == 0) && (((uint64_t)request_input_buffer % 4) == 0) &&\n      (((uint64_t)output_buffer_with_offset % 4) == 0)) {\n    int32_t* input_4 = (int32_t*)request_input_buffer;\n    int32_t* output_4 = (int32_t*)output_buffer_with_offset;\n    int element_count = byte_size / 4;\n    for (int elem_id = lane_id; elem_id < element_count;\n         elem_id += THREADBLOCK_SIZE) {\n      output_4[elem_id] = input_4[elem_id];\n    }\n  } else {\n    for (int elem_id = lane_id; elem_id < byte_size;\n         elem_id += THREADBLOCK_SIZE) {\n      output_buffer_with_offset[elem_id] =\n          __ldg(request_input_buffer + elem_id);\n    }\n  }\n}\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\ncudaError_t\nRunGatherKernel(\n    const int8_t** input_ptr_buffer, const 
size_t* byte_size_buffer,\n    const size_t* byte_size_offset_buffer, int8_t* output_buffer,\n    size_t request_count, cudaStream_t stream)\n{\n  TritonGatherKernel<<<request_count, THREADBLOCK_SIZE, 0, stream>>>(\n      input_ptr_buffer, byte_size_buffer, byte_size_offset_buffer,\n      output_buffer);\n  return cudaGetLastError();\n}\n\n#ifdef __cplusplus\n}\n#endif\n"
  },
  {
    "path": "src/kernel.h",
    "content": "// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.\n//\n// Redistribution and use in source and binary forms, with or without\n// modification, are permitted provided that the following conditions\n// are met:\n//  * Redistributions of source code must retain the above copyright\n//    notice, this list of conditions and the following disclaimer.\n//  * Redistributions in binary form must reproduce the above copyright\n//    notice, this list of conditions and the following disclaimer in the\n//    documentation and/or other materials provided with the distribution.\n//  * Neither the name of NVIDIA CORPORATION nor the names of its\n//    contributors may be used to endorse or promote products derived\n//    from this software without specific prior written permission.\n//\n// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n\n#pragma once\n#include <cuda_runtime_api.h>\n#include <stdint.h>\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\ncudaError_t RunGatherKernel(\n    const int8_t** input_ptr_buffer, const size_t* byte_size_buffer,\n    const size_t* byte_size_offset_buffer, int8_t* output_buffer,\n    size_t request_count, cudaStream_t stream);\n\n#ifdef __cplusplus\n}\n#endif\n"
  }
]